summaryrefslogtreecommitdiffstats
path: root/ctdb/server
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-05 17:47:29 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-05 17:47:29 +0000
commit4f5791ebd03eaec1c7da0865a383175b05102712 (patch)
tree8ce7b00f7a76baa386372422adebbe64510812d4 /ctdb/server
parentInitial commit. (diff)
downloadsamba-upstream.tar.xz
samba-upstream.zip
Adding upstream version 2:4.17.12+dfsg.upstream/2%4.17.12+dfsgupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ctdb/server')
-rw-r--r--ctdb/server/ctdb_banning.c146
-rw-r--r--ctdb/server/ctdb_call.c2082
-rw-r--r--ctdb/server/ctdb_client.c1709
-rw-r--r--ctdb/server/ctdb_cluster_mutex.c382
-rw-r--r--ctdb/server/ctdb_cluster_mutex.h51
-rw-r--r--ctdb/server/ctdb_config.c183
-rw-r--r--ctdb/server/ctdb_config.h59
-rw-r--r--ctdb/server/ctdb_control.c1089
-rw-r--r--ctdb/server/ctdb_daemon.c2248
-rw-r--r--ctdb/server/ctdb_fork.c216
-rw-r--r--ctdb/server/ctdb_freeze.c923
-rw-r--r--ctdb/server/ctdb_keepalive.c234
-rw-r--r--ctdb/server/ctdb_lock.c996
-rw-r--r--ctdb/server/ctdb_lock_helper.c350
-rw-r--r--ctdb/server/ctdb_logging.c174
-rw-r--r--ctdb/server/ctdb_ltdb_server.c1663
-rw-r--r--ctdb/server/ctdb_monitor.c509
-rw-r--r--ctdb/server/ctdb_mutex_fcntl_helper.c794
-rw-r--r--ctdb/server/ctdb_persistent.c397
-rw-r--r--ctdb/server/ctdb_recover.c1243
-rw-r--r--ctdb/server/ctdb_recoverd.c3286
-rw-r--r--ctdb/server/ctdb_recovery_helper.c3200
-rw-r--r--ctdb/server/ctdb_server.c608
-rw-r--r--ctdb/server/ctdb_statistics.c93
-rw-r--r--ctdb/server/ctdb_takeover.c2653
-rw-r--r--ctdb/server/ctdb_takeover_helper.c1276
-rw-r--r--ctdb/server/ctdb_traverse.c781
-rw-r--r--ctdb/server/ctdb_tunables.c170
-rw-r--r--ctdb/server/ctdb_tunnel.c141
-rw-r--r--ctdb/server/ctdb_update_record.c372
-rw-r--r--ctdb/server/ctdb_uptime.c55
-rw-r--r--ctdb/server/ctdb_vacuum.c1990
-rw-r--r--ctdb/server/ctdbd.c405
-rw-r--r--ctdb/server/eventscript.c845
-rw-r--r--ctdb/server/ipalloc.c284
-rw-r--r--ctdb/server/ipalloc.h67
-rw-r--r--ctdb/server/ipalloc_common.c192
-rw-r--r--ctdb/server/ipalloc_deterministic.c63
-rw-r--r--ctdb/server/ipalloc_lcp2.c525
-rw-r--r--ctdb/server/ipalloc_nondeterministic.c150
-rw-r--r--ctdb/server/ipalloc_private.h57
-rw-r--r--ctdb/server/legacy_conf.c80
-rw-r--r--ctdb/server/legacy_conf.h35
43 files changed, 32776 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_banning.c b/ctdb/server/ctdb_banning.c
new file mode 100644
index 0000000..3c71157
--- /dev/null
+++ b/ctdb/server/ctdb_banning.c
@@ -0,0 +1,146 @@
+/*
+ ctdb banning code
+
+ Copyright (C) Ronnie Sahlberg 2009
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ * Timer callback that runs when the ban period armed by
+ * ctdb_control_set_ban_state() expires.
+ *
+ * private_data is the struct ctdb_context.  If not all databases are
+ * frozen the node bans itself again; otherwise the BANNED flag of the
+ * local node is cleared and the stored ban state is released.
+ */
+static void ctdb_ban_node_event(struct tevent_context *ev,
+				struct tevent_timer *te,
+				struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb;
+	bool frozen;
+
+	ctdb = talloc_get_type(private_data, struct ctdb_context);
+	frozen = ctdb_db_all_frozen(ctdb);
+
+	if (frozen) {
+		DEBUG(DEBUG_ERR,("Banning timed out\n"));
+		ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_BANNED;
+
+		/* Release the stored ban state, if there is one */
+		TALLOC_FREE(ctdb->banning_ctx);
+		return;
+	}
+
+	/* The databases could not all be frozen while we were banned */
+	DEBUG(DEBUG_ERR, ("Banning timed out, but not all databases "
+			  "frozen yet - banning this node again.\n"));
+	ctdb_ban_self(ctdb);
+}
+
+/*
+ * Handle a SET_BAN_STATE control: ban or unban the local node.
+ *
+ * indata.dptr is interpreted as a struct ctdb_ban_state.  A request that
+ * names another PNN is rejected with -1.  A time of 0 unbans the node;
+ * any other value bans it for that many seconds, unless the enable_bans
+ * tunable is 0, in which case the request is ignored.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_ban_state *req = (struct ctdb_ban_state *)indata.dptr;
+	struct ctdb_ban_state *saved;
+	bool was_banned;
+
+	DEBUG(DEBUG_INFO,("SET BAN STATE\n"));
+
+	if (req->pnn != ctdb->pnn) {
+		DEBUG(DEBUG_WARNING,
+		      ("SET_BAN_STATE control for PNN %d ignored\n",
+		       req->pnn));
+		return -1;
+	}
+
+	/* Any previous ban state (the parent of its timer) is discarded */
+	was_banned = (ctdb->banning_ctx != NULL);
+	TALLOC_FREE(ctdb->banning_ctx);
+
+	if (req->time == 0) {
+		DEBUG(DEBUG_ERR,("Unbanning this node\n"));
+		ctdb->nodes[req->pnn]->flags &= ~NODE_FLAGS_BANNED;
+		return 0;
+	}
+
+	if (ctdb->tunable.enable_bans == 0) {
+		DEBUG(DEBUG_ERR,("Bans are disabled - ignoring ban of node %u\n", req->pnn));
+		return 0;
+	}
+
+	saved = talloc(ctdb, struct ctdb_ban_state);
+	ctdb->banning_ctx = saved;
+	if (saved == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " ERROR Failed to allocate new banning state\n"));
+		return -1;
+	}
+	*saved = *req;
+
+	DEBUG(DEBUG_ERR,("Banning this node for %d seconds\n", req->time));
+	ctdb->nodes[req->pnn]->flags |= NODE_FLAGS_BANNED;
+
+	/* The expiry timer is allocated on the ban state */
+	tevent_add_timer(ctdb->ev, ctdb->banning_ctx,
+			 timeval_current_ofs(req->time,0),
+			 ctdb_ban_node_event, ctdb);
+
+	/* Only a node that was not already banned needs to go inactive */
+	if (!was_banned) {
+		ctdb_node_become_inactive(ctdb);
+	}
+	return 0;
+}
+
+/*
+ * Handle a GET_BAN_STATE control.
+ *
+ * outdata is filled with a struct ctdb_ban_state allocated on outdata:
+ * a copy of the stored ban state if there is one, otherwise the local
+ * pnn with a time of 0.  Returns 0 on success.
+ */
+int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+	const struct ctdb_ban_state *active =
+		(const struct ctdb_ban_state *)(ctdb->banning_ctx);
+	struct ctdb_ban_state *reply;
+
+	reply = talloc(outdata, struct ctdb_ban_state);
+	CTDB_NO_MEMORY(ctdb, reply);
+
+	if (active == NULL) {
+		reply->pnn = ctdb->pnn;
+		reply->time = 0;
+	} else {
+		*reply = *active;
+	}
+
+	outdata->dsize = sizeof(struct ctdb_ban_state);
+	outdata->dptr = (uint8_t *)reply;
+
+	return 0;
+}
+
+/*
+ * Ban the local node for the duration given by the recovery_ban_period
+ * tunable, by handing a locally built request to
+ * ctdb_control_set_ban_state().  The result of that call is not checked.
+ */
+void ctdb_ban_self(struct ctdb_context *ctdb)
+{
+	struct ctdb_ban_state self_ban = {
+		.pnn = ctdb->pnn,
+		.time = ctdb->tunable.recovery_ban_period,
+	};
+	TDB_DATA request = {
+		.dptr = (uint8_t *)&self_ban,
+		.dsize = sizeof(self_ban),
+	};
+
+	ctdb_control_set_ban_state(ctdb, request);
+}
diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c
new file mode 100644
index 0000000..1d5dea3
--- /dev/null
+++ b/ctdb/server/ctdb_call.c
@@ -0,0 +1,2082 @@
+/*
+ ctdb_call protocol code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ see http://wiki.samba.org/index.php/Samba_%26_Clustering for
+ protocol design and packet details
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/hash_count.h"
+
+struct ctdb_sticky_record { /* per-key entry stored in ctdb_db->sticky_records */
+ struct ctdb_context *ctdb; /* daemon context */
+ struct ctdb_db_context *ctdb_db; /* database the key belongs to */
+ TDB_CONTEXT *pindown; /* NULL unless the record is pinned down; used as the
+    talloc parent of the pindown timer and of the requests deferred
+    while the pindown lasts */
+};
+
+/*
+  look up an attached database by its db id
+
+  walks ctdb->db_list and returns the first entry whose db_id equals id,
+  or NULL when there is none
+ */
+struct ctdb_db_context *find_ctdb_db(struct ctdb_context *ctdb, uint32_t id)
+{
+	struct ctdb_db_context *db = ctdb->db_list;
+
+	while (db != NULL && db->db_id != id) {
+		db = db->next;
+	}
+
+	return db;
+}
+
+/*
+  wrapper around ctdb_input_pkt() taking the ctdb context as a void
+  pointer, so that it can be handed to the lock requeue code as the
+  callback that re-processes a packet
+*/
+static void ctdb_call_input_pkt(void *p, struct ctdb_req_header *hdr)
+{
+	ctdb_input_pkt(talloc_get_type(p, struct ctdb_context), hdr);
+}
+
+
+/*
+  send a CTDB_REPLY_ERROR packet to the node that sent hdr
+
+  The reply carries hdr's reqid, the given status and the printf-style
+  formatted message including its terminating NUL.  Nothing is sent when
+  the transport is down (ctdb->methods == NULL).
+*/
+static void ctdb_send_error(struct ctdb_context *ctdb,
+			    struct ctdb_req_header *hdr, uint32_t status,
+			    const char *fmt, ...) PRINTF_ATTRIBUTE(4,5);
+static void ctdb_send_error(struct ctdb_context *ctdb,
+			    struct ctdb_req_header *hdr, uint32_t status,
+			    const char *fmt, ...)
+{
+	struct ctdb_reply_error_old *reply;
+	va_list args;
+	char *text;
+	int textlen, fixedlen;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " Failed to send error. Transport is DOWN\n"));
+		return;
+	}
+
+	va_start(args, fmt);
+	text = talloc_vasprintf(ctdb, fmt, args);
+	if (text == NULL) {
+		ctdb_fatal(ctdb, "Unable to allocate error in ctdb_send_error\n");
+	}
+	va_end(args);
+
+	/* fixed part of the packet, followed by the message text */
+	fixedlen = offsetof(struct ctdb_reply_error_old, msg);
+	textlen = strlen(text)+1;
+
+	/* the packet is allocated with the message string as its context */
+	reply = ctdb_transport_allocate(ctdb, text, CTDB_REPLY_ERROR,
+					fixedlen + textlen,
+					struct ctdb_reply_error_old);
+	CTDB_NO_MEMORY_FATAL(ctdb, reply);
+
+	reply->hdr.destnode = hdr->srcnode;
+	reply->hdr.reqid = hdr->reqid;
+	reply->status = status;
+	reply->msglen = textlen;
+	memcpy(&reply->msg[0], text, textlen);
+
+	ctdb_queue_packet(ctdb, &reply->hdr);
+
+	talloc_free(text);
+}
+
+
+/**
+ * forward a call request one hop closer to the record's dmaster
+ *
+ * Background:
+ *
+ * A client that wants a record sends a CTDB_REQ_CALL packet to its
+ * local ctdb (ctdb_request_call). A node that is not the record's
+ * DMASTER passes the packet on to the record's LMASTER, and the
+ * LMASTER passes it on to the current DMASTER. This works because a
+ * node that migrates a record away stores the new DMASTER in the copy
+ * of the record that it keeps.
+ *
+ * Accordingly the destination chosen here is the lmaster of the key,
+ * unless this node is the lmaster, in which case it is the dmaster
+ * recorded in header. The hop count of the request is incremented and
+ * a warning is logged when hopcount % 100 exceeds 95.
+ */
+static void ctdb_call_send_redirect(struct ctdb_context *ctdb,
+				    struct ctdb_db_context *ctdb_db,
+				    TDB_DATA key,
+				    struct ctdb_req_call_old *c,
+				    struct ctdb_ltdb_header *header)
+{
+	const uint32_t lmaster = ctdb_lmaster(ctdb, &key);
+
+	if (ctdb->pnn == lmaster) {
+		c->hdr.destnode = header->dmaster;
+	} else {
+		c->hdr.destnode = lmaster;
+	}
+	c->hopcount += 1;
+
+	if ((c->hopcount % 100) > 95) {
+		DEBUG(DEBUG_WARNING,("High hopcount %d dbid:%s "
+			"key:0x%08x reqid=%08x pnn:%d src:%d lmaster:%d "
+			"header->dmaster:%d dst:%d\n",
+			c->hopcount, ctdb_db->db_name, ctdb_hash(&key),
+			c->hdr.reqid, ctdb->pnn, c->hdr.srcnode, lmaster,
+			header->dmaster, c->hdr.destnode));
+	}
+
+	ctdb_queue_packet(ctdb, &c->hdr);
+}
+
+
+/*
+ send a dmaster reply
+
+ caller must have the chainlock before calling this routine. Caller must be
+ the lmaster
+
+ The local copy of the record is stored first, with header->dmaster set to
+ new_dmaster. Then a CTDB_REPLY_DMASTER packet addressed to new_dmaster is
+ queued; it carries reqid, the database generation and id, header->rsn, and
+ a payload made of the key, the data and the 32-bit header->flags.
+
+ If this node is not the lmaster of key, an alert is logged and nothing
+ else is done. A failed store, or a transport that is down, is passed to
+ ctdb_fatal().
+*/
+static void ctdb_send_dmaster_reply(struct ctdb_db_context *ctdb_db,
+ struct ctdb_ltdb_header *header,
+ TDB_DATA key, TDB_DATA data,
+ uint32_t new_dmaster,
+ uint32_t reqid)
+{
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ struct ctdb_reply_dmaster_old *r;
+ int ret, len;
+ TALLOC_CTX *tmp_ctx;
+
+ if (ctdb->pnn != ctdb_lmaster(ctdb, &key)) {
+ DEBUG(DEBUG_ALERT,(__location__ " Caller is not lmaster!\n"));
+ return;
+ }
+
+ header->dmaster = new_dmaster; /* note: modifies the caller's header */
+ ret = ctdb_ltdb_store(ctdb_db, key, header, data);
+ if (ret != 0) {
+ ctdb_fatal(ctdb, "ctdb_send_dmaster_reply unable to update dmaster");
+ return;
+ }
+
+ if (ctdb->methods == NULL) {
+ ctdb_fatal(ctdb, "ctdb_send_dmaster_reply cant update dmaster since transport is down");
+ return;
+ }
+
+ /* put the packet on a temporary context, allowing us to safely free
+ it below even if ctdb_reply_dmaster() has freed it already */
+ tmp_ctx = talloc_new(ctdb); /* NOTE(review): result is not checked for NULL */
+
+ /* send the CTDB_REPLY_DMASTER; the payload is key, data, then the
+ record flags as a uint32_t */
+ len = offsetof(struct ctdb_reply_dmaster_old, data) + key.dsize + data.dsize + sizeof(uint32_t);
+ r = ctdb_transport_allocate(ctdb, tmp_ctx, CTDB_REPLY_DMASTER, len,
+ struct ctdb_reply_dmaster_old);
+ CTDB_NO_MEMORY_FATAL(ctdb, r);
+
+ r->hdr.destnode = new_dmaster;
+ r->hdr.reqid = reqid;
+ r->hdr.generation = ctdb_db->generation;
+ r->rsn = header->rsn;
+ r->keylen = key.dsize;
+ r->datalen = data.dsize;
+ r->db_id = ctdb_db->db_id;
+ memcpy(&r->data[0], key.dptr, key.dsize);
+ memcpy(&r->data[key.dsize], data.dptr, data.dsize);
+ memcpy(&r->data[key.dsize+data.dsize], &header->flags, sizeof(uint32_t));
+
+ ctdb_queue_packet(ctdb, &r->hdr);
+
+ talloc_free(tmp_ctx);
+}
+
+/*
+ send a dmaster request (give another node the dmaster for a record)
+
+ This is always sent to the lmaster, which ensures that the lmaster
+ always knows who the dmaster is. The lmaster will then send a
+ CTDB_REPLY_DMASTER to the new dmaster
+
+ The new dmaster is the node that sent the call request c (c->hdr.srcnode).
+ If data is not empty, CTDB_REC_FLAG_MIGRATED_WITH_DATA is set in
+ header->flags. If this node is itself the lmaster of the key, no
+ CTDB_REQ_DMASTER packet is built and ctdb_send_dmaster_reply() is called
+ directly. Otherwise the local copy of the record is stored with the new
+ dmaster before the packet is queued.
+
+ A transport that is down, or a failed store, is passed to ctdb_fatal().
+*/
+static void ctdb_call_send_dmaster(struct ctdb_db_context *ctdb_db,
+ struct ctdb_req_call_old *c,
+ struct ctdb_ltdb_header *header,
+ TDB_DATA *key, TDB_DATA *data)
+{
+ struct ctdb_req_dmaster_old *r;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ int len;
+ uint32_t lmaster = ctdb_lmaster(ctdb, key);
+
+ if (ctdb->methods == NULL) {
+ ctdb_fatal(ctdb, "Failed ctdb_call_send_dmaster since transport is down");
+ return;
+ }
+
+ if (data->dsize != 0) {
+ header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+ }
+
+ if (lmaster == ctdb->pnn) { /* we are the lmaster: no request packet needed */
+ ctdb_send_dmaster_reply(ctdb_db, header, *key, *data,
+ c->hdr.srcnode, c->hdr.reqid);
+ return;
+ }
+
+ len = offsetof(struct ctdb_req_dmaster_old, data) + key->dsize + data->dsize
+ + sizeof(uint32_t); /* payload: key, data, then the record flags */
+ r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_DMASTER, len,
+ struct ctdb_req_dmaster_old);
+ CTDB_NO_MEMORY_FATAL(ctdb, r);
+ r->hdr.destnode = lmaster;
+ r->hdr.reqid = c->hdr.reqid;
+ r->hdr.generation = ctdb_db->generation;
+ r->db_id = c->db_id;
+ r->rsn = header->rsn;
+ r->dmaster = c->hdr.srcnode;
+ r->keylen = key->dsize;
+ r->datalen = data->dsize;
+ memcpy(&r->data[0], key->dptr, key->dsize);
+ memcpy(&r->data[key->dsize], data->dptr, data->dsize);
+ memcpy(&r->data[key->dsize + data->dsize], &header->flags, sizeof(uint32_t));
+
+ header->dmaster = c->hdr.srcnode; /* record the new dmaster locally before sending */
+ if (ctdb_ltdb_store(ctdb_db, *key, header, *data) != 0) {
+ ctdb_fatal(ctdb, "Failed to store record in ctdb_call_send_dmaster");
+ }
+
+ ctdb_queue_packet(ctdb, &r->hdr);
+
+ talloc_free(r);
+}
+
+/*
+ * Timer callback that ends a pindown period.
+ *
+ * private_data is the struct ctdb_sticky_record; its pindown context is
+ * freed (if set) and the pointer reset to NULL.
+ */
+static void ctdb_sticky_pindown_timeout(struct tevent_context *ev,
+					struct tevent_timer *te,
+					struct timeval t, void *private_data)
+{
+	struct ctdb_sticky_record *rec;
+
+	rec = talloc_get_type(private_data, struct ctdb_sticky_record);
+
+	DEBUG(DEBUG_ERR,("Pindown timeout db:%s unstick record\n", rec->ctdb_db->db_name));
+	TALLOC_FREE(rec->pindown);
+}
+
+/*
+ * Start a pindown period for key, if key has a sticky record in ctdb_db
+ * and is not pinned down already.
+ *
+ * A pindown context is created on the sticky record and a timer for
+ * ctdb_sticky_pindown_timeout() is armed on it, using the sticky_pindown
+ * tunable (milliseconds) as the delay.
+ *
+ * Returns 0 on success, or when there is nothing to do, and -1 if the
+ * lookup key, the pindown context or the timer cannot be allocated.
+ */
+static int
+ctdb_set_sticky_pindown(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	uint32_t *k;
+	struct ctdb_sticky_record *sr;
+	struct tevent_timer *te;
+
+	k = ctdb_key_to_idkey(tmp_ctx, key);
+	if (k == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+	talloc_free(tmp_ctx);
+
+	if (sr == NULL) {
+		/* no sticky record for this key - nothing to pin down */
+		return 0;
+	}
+
+	if (sr->pindown != NULL) {
+		/* a pindown period is already running */
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,("Pinning down record in %s for %d ms\n", ctdb_db->db_name, ctdb->tunable.sticky_pindown));
+	sr->pindown = talloc_new(sr);
+	if (sr->pindown == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate pindown context for sticky record\n"));
+		return -1;
+	}
+
+	/*
+	 * Split the millisecond value into whole seconds and the remaining
+	 * microseconds.  Taking the remainder before scaling keeps the
+	 * intermediate value below 1000000, so it cannot overflow however
+	 * large the tunable is.
+	 */
+	te = tevent_add_timer(ctdb->ev, sr->pindown,
+			      timeval_current_ofs(ctdb->tunable.sticky_pindown / 1000,
+						  (ctdb->tunable.sticky_pindown % 1000) * 1000),
+			      ctdb_sticky_pindown_timeout, sr);
+	if (te == NULL) {
+		/*
+		 * Without the timer the pindown would never end, so undo
+		 * it rather than leave the record pinned down.
+		 */
+		DEBUG(DEBUG_ERR,("Failed to add pindown timer for sticky record\n"));
+		TALLOC_FREE(sr->pindown);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  called when a CTDB_REPLY_DMASTER packet comes in, or when the lmaster
+  gets a CTDB_REQ_DMASTER for itself. We become the dmaster.
+
+  must be called with the chainlock held. This function releases the chainlock
+
+  The record is stored locally with this node as dmaster, using the rsn
+  and record flags supplied by the caller. hdr->reqid must identify a
+  pending struct ctdb_call_state for the same key: if it does, the call
+  is run locally and the state's async callback (if set) is invoked. If it
+  does not, the packet is logged and dropped; the record has already been
+  stored at that point.
+*/
+static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db,
+				struct ctdb_req_header *hdr,
+				TDB_DATA key, TDB_DATA data,
+				uint64_t rsn, uint32_t record_flags)
+{
+	struct ctdb_call_state *state;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_ltdb_header header;
+	int ret;
+
+	DEBUG(DEBUG_DEBUG,("pnn %u dmaster response %08x\n", ctdb->pnn, ctdb_hash(&key)));
+
+	ZERO_STRUCT(header);
+	header.rsn = rsn;
+	header.dmaster = ctdb->pnn;
+	header.flags = record_flags;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state);
+
+	if (state) {
+		if (state->call->flags & CTDB_CALL_FLAG_VACUUM_MIGRATION) {
+			/*
+			 * We temporarily add the VACUUM_MIGRATED flag to
+			 * the record flags, so that ctdb_ltdb_store can
+			 * decide whether the record should be stored or
+			 * deleted.
+			 */
+			header.flags |= CTDB_REC_FLAG_VACUUM_MIGRATED;
+		}
+	}
+
+	if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+		ctdb_fatal(ctdb, "ctdb_reply_dmaster store failed\n");
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	/* we just became DMASTER and this database is "sticky",
+	   see if the record is flagged as "hot" and set up a pin-down
+	   context to stop migrations for a little while if so
+	*/
+	if (ctdb_db_sticky(ctdb_db)) {
+		ctdb_set_sticky_pindown(ctdb, ctdb_db, key);
+	}
+
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_become_dmaster from node %u\n",
+			 ctdb->pnn, hdr->reqid, hdr->srcnode));
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	/* the pending call must be for exactly the key in this packet */
+	if (key.dsize != state->call->key.dsize || memcmp(key.dptr, state->call->key.dptr, key.dsize)) {
+		DEBUG(DEBUG_ERR, ("Got bogus DMASTER packet reqid:%u from node %u. Key does not match key held in matching idr.\n", hdr->reqid, hdr->srcnode));
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	if (hdr->reqid != state->reqid) {
+		/* we found a record but it was the wrong one */
+		DEBUG(DEBUG_ERR, ("Dropped orphan in ctdb_become_dmaster with reqid:%u from node %u\n", hdr->reqid, hdr->srcnode));
+
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		return;
+	}
+
+	/* count this migration; the result of the update is ignored */
+	(void) hash_count_increment(ctdb_db->migratedb, key);
+
+	ctdb_call_local(ctdb_db, state->call, &header, state, &data, true);
+
+	ret = ctdb_ltdb_unlock(ctdb_db, state->call->key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+	}
+
+	state->state = CTDB_CALL_DONE;
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
+}
+
+struct dmaster_defer_call { /* one request parked on a dmaster_defer_queue */
+ struct dmaster_defer_call *next, *prev; /* links used by the DLIST_* macros */
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr; /* the deferred packet, stolen onto this entry */
+};
+
+struct dmaster_defer_queue { /* per-key entry stored in ctdb_db->defer_dmaster */
+ struct ctdb_db_context *ctdb_db;
+ uint32_t generation; /* generation of the request that created the queue */
+ struct dmaster_defer_call *deferred_calls; /* list head; new entries are appended */
+};
+
+/*
+ * Timer callback scheduled by dmaster_defer_queue_destructor(): passes
+ * one deferred request back to ctdb_input_pkt() and then frees the
+ * entry that held it.
+ */
+static void dmaster_defer_reprocess(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval t,
+				    void *private_data)
+{
+	struct dmaster_defer_call *deferred;
+
+	deferred = talloc_get_type(private_data, struct dmaster_defer_call);
+
+	ctdb_input_pkt(deferred->ctdb, deferred->hdr);
+	talloc_free(deferred);
+}
+
+/*
+ * talloc destructor of a defer queue.
+ *
+ * When the queue's generation still matches the database's, every
+ * deferred call is unlinked, re-parented to the ctdb context and
+ * scheduled for re-processing through a zero-delay timer.  When the
+ * generations differ nothing is rescheduled.  Always returns 0.
+ */
+static int dmaster_defer_queue_destructor(struct dmaster_defer_queue *ddq)
+{
+	struct dmaster_defer_call *pending;
+
+	/* A recovery happened in between: do not reschedule anything */
+	if (ddq->ctdb_db->generation != ddq->generation) {
+		return 0;
+	}
+
+	for (pending = ddq->deferred_calls;
+	     pending != NULL;
+	     pending = ddq->deferred_calls) {
+		DLIST_REMOVE(ddq->deferred_calls, pending);
+
+		talloc_steal(pending->ctdb, pending);
+		tevent_add_timer(pending->ctdb->ev, pending, timeval_zero(),
+				 dmaster_defer_reprocess, pending);
+	}
+
+	return 0;
+}
+
+/*
+ * Callback given to trbt_insertarray32_callback() by
+ * dmaster_defer_setup(): frees data if it is not NULL and returns parm.
+ */
+static void *insert_ddq_callback(void *parm, void *data)
+{
+	if (data == NULL) {
+		return parm;
+	}
+
+	talloc_free(data);
+	return parm;
+}
+
+/**
+ * This function is used to register a key in database that needs to be updated.
+ * Any requests for that key should get deferred till this is completed.
+ *
+ * A struct dmaster_defer_queue is allocated on hdr and inserted into
+ * ctdb_db->defer_dmaster under the id-key derived from key. A queue that
+ * already exists for the key is kept if its generation matches the
+ * database's current generation; otherwise it is freed and replaced.
+ *
+ * Returns 0 on success (including the already-registered case) and -1 if
+ * the id-key or the queue cannot be allocated.
+ */
+static int dmaster_defer_setup(struct ctdb_db_context *ctdb_db,
+ struct ctdb_req_header *hdr,
+ TDB_DATA key)
+{
+ uint32_t *k;
+ struct dmaster_defer_queue *ddq;
+
+ k = ctdb_key_to_idkey(hdr, key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate key for dmaster defer setup\n"));
+ return -1;
+ }
+
+ /* Already exists */
+ ddq = trbt_lookuparray32(ctdb_db->defer_dmaster, k[0], k);
+ if (ddq != NULL) {
+ if (ddq->generation == ctdb_db->generation) {
+ talloc_free(k);
+ return 0;
+ }
+
+ /* Recovery occurred - get rid of old queue. All the deferred
+ * requests will be resent anyway from ctdb_call_resend_db.
+ */
+ talloc_free(ddq);
+ }
+
+ ddq = talloc(hdr, struct dmaster_defer_queue); /* parented to the request packet */
+ if (ddq == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate dmaster defer queue\n"));
+ talloc_free(k);
+ return -1;
+ }
+ ddq->ctdb_db = ctdb_db;
+ ddq->generation = hdr->generation;
+ ddq->deferred_calls = NULL;
+
+ trbt_insertarray32_callback(ctdb_db->defer_dmaster, k[0], k,
+ insert_ddq_callback, ddq);
+ talloc_set_destructor(ddq, dmaster_defer_queue_destructor); /* reschedules the deferred calls when the queue is freed */
+
+ talloc_free(k);
+ return 0;
+}
+
+/*
+ * Park the request hdr on the defer queue registered for key, if any.
+ *
+ * Returns 0 when the request has been queued; hdr is then owned by the
+ * queue entry.  Returns -1 when the request was not queued: no queue
+ * exists for the key, the queue belongs to another generation (it is then
+ * freed without running its destructor), or an allocation failed.
+ */
+static int dmaster_defer_add(struct ctdb_db_context *ctdb_db,
+			     struct ctdb_req_header *hdr,
+			     TDB_DATA key)
+{
+	struct dmaster_defer_queue *queue;
+	struct dmaster_defer_call *entry;
+	uint32_t *idkey;
+
+	idkey = ctdb_key_to_idkey(hdr, key);
+	if (idkey == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate key for dmaster defer add\n"));
+		return -1;
+	}
+
+	/* the id-key is only needed for the lookup */
+	queue = trbt_lookuparray32(ctdb_db->defer_dmaster, idkey[0], idkey);
+	talloc_free(idkey);
+
+	if (queue == NULL) {
+		return -1;
+	}
+
+	if (hdr->generation != queue->generation) {
+		/* stale queue: drop it without rescheduling its calls */
+		talloc_set_destructor(queue, NULL);
+		talloc_free(queue);
+		return -1;
+	}
+
+	entry = talloc(queue, struct dmaster_defer_call);
+	if (entry == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate dmaster defer call\n"));
+		return -1;
+	}
+
+	entry->ctdb = ctdb_db->ctdb;
+	entry->hdr = talloc_steal(entry, hdr);
+
+	DLIST_ADD_END(queue->deferred_calls, entry);
+
+	return 0;
+}
+
+/*
+ called when a CTDB_REQ_DMASTER packet comes in
+
+ this comes into the lmaster for a record when the current dmaster
+ wants to give up the dmaster role and give it to someone else
+
+ The packet payload is the key, followed by the data, optionally followed
+ by a uint32_t of record flags. The key is registered with
+ dmaster_defer_setup(), the current record is fetched under the chainlock,
+ and then either this node becomes the dmaster itself (when c->dmaster is
+ our pnn) or a CTDB_REPLY_DMASTER is sent to the new dmaster.
+
+ A request whose sender is not the recorded dmaster is logged; unless the
+ local record has rsn 0 and names this node as dmaster, recovery mode is
+ set to active and the request is dropped.
+*/
+void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ struct ctdb_req_dmaster_old *c = (struct ctdb_req_dmaster_old *)hdr;
+ TDB_DATA key, data, data2;
+ struct ctdb_ltdb_header header;
+ struct ctdb_db_context *ctdb_db;
+ uint32_t record_flags = 0;
+ size_t len;
+ int ret;
+
+ key.dptr = c->data;
+ key.dsize = c->keylen;
+ data.dptr = c->data + c->keylen;
+ data.dsize = c->datalen;
+ len = offsetof(struct ctdb_req_dmaster_old, data) + key.dsize + data.dsize
+ + sizeof(uint32_t);
+ if (len <= c->hdr.length) { /* flags are read only if the packet is long enough to hold them */
+ memcpy(&record_flags, &c->data[c->keylen + c->datalen],
+ sizeof(record_flags));
+ }
+
+ ctdb_db = find_ctdb_db(ctdb, c->db_id);
+ if (!ctdb_db) {
+ ctdb_send_error(ctdb, hdr, -1,
+ "Unknown database in request. db_id==0x%08x",
+ c->db_id);
+ return;
+ }
+
+ dmaster_defer_setup(ctdb_db, hdr, key); /* return value is not checked */
+
+ /* fetch the current record; data2 receives the local copy of the data
+ and is not used further in this function */
+ ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, hdr, &data2,
+ ctdb_call_input_pkt, ctdb, false);
+ if (ret == -1) {
+ ctdb_fatal(ctdb, "ctdb_req_dmaster failed to fetch record");
+ return;
+ }
+ if (ret == -2) {
+ DEBUG(DEBUG_INFO,(__location__ " deferring ctdb_request_dmaster\n"));
+ return;
+ }
+
+ if (ctdb_lmaster(ctdb, &key) != ctdb->pnn) {
+ DEBUG(DEBUG_ERR, ("dmaster request to non-lmaster "
+ "db=%s lmaster=%u gen=%u curgen=%u\n",
+ ctdb_db->db_name, ctdb_lmaster(ctdb, &key),
+ hdr->generation, ctdb_db->generation));
+ ctdb_fatal(ctdb, "ctdb_req_dmaster to non-lmaster");
+ }
+
+ DEBUG(DEBUG_DEBUG,("pnn %u dmaster request on %08x for %u from %u\n",
+ ctdb->pnn, ctdb_hash(&key), c->dmaster, c->hdr.srcnode));
+
+ /* its a protocol error if the sending node is not the current dmaster */
+ if (header.dmaster != hdr->srcnode) {
+ DEBUG(DEBUG_ALERT,("pnn %u dmaster request for new-dmaster %u from non-master %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u keyval=0x%08x\n",
+ ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key),
+ ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation,
+ (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid,
+ (key.dsize >= 4)?(*(uint32_t *)key.dptr):0));
+ if (header.rsn != 0 || header.dmaster != ctdb->pnn) {
+ DEBUG(DEBUG_ERR,("ctdb_req_dmaster from non-master. Force a recovery.\n"));
+
+ ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+ ctdb_ltdb_unlock(ctdb_db, key);
+ return;
+ }
+ }
+
+ if (header.rsn > c->rsn) { /* logged only; the request is still processed */
+ DEBUG(DEBUG_ALERT,("pnn %u dmaster request with older RSN new-dmaster %u from %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u\n",
+ ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key),
+ ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation,
+ (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid));
+ }
+
+ /* use the rsn from the sending node */
+ header.rsn = c->rsn;
+
+ /* store the record flags from the sending node */
+ header.flags = record_flags;
+
+ /* check if the new dmaster is the lmaster, in which case we
+ skip the dmaster reply; ctdb_become_dmaster() releases the
+ chainlock itself */
+ if (c->dmaster == ctdb->pnn) {
+ ctdb_become_dmaster(ctdb_db, hdr, key, data, c->rsn, record_flags);
+ } else {
+ ctdb_send_dmaster_reply(ctdb_db, &header, key, data, c->dmaster, hdr->reqid);
+
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+ }
+ }
+}
+
+/*
+ * Timer callback armed by ctdb_make_record_sticky(): frees the sticky
+ * record passed as private_data once the sticky duration has elapsed.
+ */
+static void ctdb_sticky_record_timeout(struct tevent_context *ev,
+				       struct tevent_timer *te,
+				       struct timeval t, void *private_data)
+{
+	struct ctdb_sticky_record *expired;
+
+	expired = talloc_get_type(private_data, struct ctdb_sticky_record);
+	talloc_free(expired);
+}
+
+/*
+ * Callback given to trbt_insertarray32_callback() by
+ * ctdb_make_record_sticky(): logs and frees data if it is not NULL, and
+ * returns parm.
+ */
+static void *ctdb_make_sticky_record_callback(void *parm, void *data)
+{
+	if (data == NULL) {
+		return parm;
+	}
+
+	DEBUG(DEBUG_ERR,("Already have sticky record registered. Free old %p and create new %p\n", data, parm));
+	talloc_free(data);
+	return parm;
+}
+
+/*
+ * Mark key as sticky in ctdb_db for the duration given by the
+ * sticky_duration tunable (seconds).
+ *
+ * A sticky record is allocated on ctdb_db->sticky_records, inserted
+ * under the id-key derived from key, and a timer for
+ * ctdb_sticky_record_timeout() is armed on it.  Nothing is done if the
+ * key already has a sticky record.
+ *
+ * Returns 0 on success or if the record is already sticky, -1 if an
+ * allocation fails.
+ */
+static int
+ctdb_make_record_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+	TALLOC_CTX *scratch = talloc_new(NULL);
+	struct ctdb_sticky_record *rec;
+	uint32_t *idkey;
+	int result = -1;
+
+	idkey = ctdb_key_to_idkey(scratch, key);
+	if (idkey == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+		goto done;
+	}
+
+	rec = trbt_lookuparray32(ctdb_db->sticky_records, idkey[0], &idkey[0]);
+	if (rec != NULL) {
+		/* already sticky */
+		result = 0;
+		goto done;
+	}
+
+	rec = talloc(ctdb_db->sticky_records, struct ctdb_sticky_record);
+	if (rec == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate sticky record structure\n"));
+		goto done;
+	}
+
+	rec->ctdb = ctdb;
+	rec->ctdb_db = ctdb_db;
+	rec->pindown = NULL;
+
+	DEBUG(DEBUG_ERR,("Make record sticky for %d seconds in db %s key:0x%08x.\n",
+			 ctdb->tunable.sticky_duration,
+			 ctdb_db->db_name, ctdb_hash(&key)));
+
+	trbt_insertarray32_callback(ctdb_db->sticky_records, idkey[0], &idkey[0], ctdb_make_sticky_record_callback, rec);
+
+	tevent_add_timer(ctdb->ev, rec,
+			 timeval_current_ofs(ctdb->tunable.sticky_duration, 0),
+			 ctdb_sticky_record_timeout, rec);
+
+	result = 0;
+
+done:
+	talloc_free(scratch);
+	return result;
+}
+
+struct pinned_down_requeue_handle { /* built by pinned_down_destructor() for the pinned_down_requeue() timer */
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr; /* the request to re-process */
+};
+
+struct pinned_down_deferred_call { /* a request parked on a sticky record's pindown context */
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr; /* the deferred request, stolen onto this structure */
+};
+
+/*
+ * Timer callback scheduled by pinned_down_destructor(): moves the
+ * deferred request onto the ctdb context, feeds it back into packet
+ * processing and frees the requeue handle.
+ */
+static void pinned_down_requeue(struct tevent_context *ev,
+				struct tevent_timer *te,
+				struct timeval t, void *private_data)
+{
+	struct pinned_down_requeue_handle *h;
+	struct ctdb_context *ctdb;
+
+	h = talloc_get_type(private_data, struct pinned_down_requeue_handle);
+	ctdb = h->ctdb;
+
+	/* detach the packet from the handle before the handle is freed */
+	talloc_steal(ctdb, h->hdr);
+	ctdb_call_input_pkt(ctdb, h->hdr);
+
+	talloc_free(h);
+}
+
+/*
+ * talloc destructor of a deferred pinned-down request.
+ *
+ * The request packet is moved onto a freshly allocated requeue handle
+ * and a zero-delay timer for pinned_down_requeue() is scheduled, so that
+ * the request is processed again.
+ *
+ * If the handle cannot be allocated the request is not requeued: an
+ * error is logged and the packet is left on the structure being freed.
+ * Always returns 0.
+ */
+static int pinned_down_destructor(struct pinned_down_deferred_call *pinned_down)
+{
+	struct ctdb_context *ctdb = pinned_down->ctdb;
+	struct pinned_down_requeue_handle *handle;
+
+	handle = talloc(ctdb, struct pinned_down_requeue_handle);
+	if (handle == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate requeue handle for pinned down request\n"));
+		return 0;
+	}
+
+	handle->ctdb = pinned_down->ctdb;
+	handle->hdr = pinned_down->hdr;
+	talloc_steal(handle, handle->hdr);
+
+	tevent_add_timer(ctdb->ev, handle, timeval_zero(),
+			 pinned_down_requeue, handle);
+
+	return 0;
+}
+
+/*
+ * Defer the request hdr if the record for key is currently pinned down.
+ *
+ * Returns 0 when the request has been parked on the sticky record's
+ * pindown context; hdr then belongs to the parked entry, whose
+ * destructor requeues it.  Returns -1 when the request was not deferred:
+ * the key has no sticky record, the record is not pinned down, or an
+ * allocation failed.
+ */
+static int
+ctdb_defer_pinned_down_request(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr)
+{
+	TALLOC_CTX *scratch = talloc_new(NULL);
+	struct ctdb_sticky_record *rec;
+	struct pinned_down_deferred_call *deferred;
+	uint32_t *idkey;
+
+	idkey = ctdb_key_to_idkey(scratch, key);
+	if (idkey == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+		talloc_free(scratch);
+		return -1;
+	}
+
+	/* the id-key is only needed for the lookup */
+	rec = trbt_lookuparray32(ctdb_db->sticky_records, idkey[0], &idkey[0]);
+	talloc_free(scratch);
+
+	if (rec == NULL || rec->pindown == NULL) {
+		return -1;
+	}
+
+	deferred = talloc(rec->pindown, struct pinned_down_deferred_call);
+	if (deferred == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate structure for deferred pinned down request\n"));
+		return -1;
+	}
+
+	deferred->ctdb = ctdb;
+	deferred->hdr = hdr;
+
+	talloc_set_destructor(deferred, pinned_down_destructor);
+	talloc_steal(deferred, hdr);
+
+	return 0;
+}
+
+/*
+ * qsort() comparison function for struct ctdb_db_hot_key: orders
+ * entries by ascending count.
+ */
+static int hot_key_cmp(const void *a, const void *b)
+{
+	const struct ctdb_db_hot_key *lhs = (const struct ctdb_db_hot_key *)a;
+	const struct ctdb_db_hot_key *rhs = (const struct ctdb_db_hot_key *)b;
+
+	if (lhs->count == rhs->count) {
+		return 0;
+	}
+
+	return (lhs->count < rhs->count) ? -1 : 1;
+}
+
+/*
+ * Record in the hot key table of ctdb_db that key has reached count.
+ *
+ * The table (ctdb_db->hot_keys) holds at most MAX_HOT_KEYS entries and
+ * is re-sorted by ascending count after every change, so slot 0 holds
+ * the smallest count.
+ *
+ *  - A key already in the table has its count raised if count is larger;
+ *    this is logged when count has at least doubled since the last time
+ *    the key was logged.
+ *  - A new key takes the next free slot, or replaces slot 0 when the
+ *    table is full and count is larger than that slot's count.
+ *
+ * If the copy of a new key cannot be allocated the table is left
+ * unchanged.
+ */
+static void
+ctdb_update_db_stat_hot_keys(struct ctdb_db_context *ctdb_db, TDB_DATA key,
+			     unsigned int count)
+{
+	unsigned int i, id;
+	char *keystr;
+	void *key_copy;
+
+	/*
+	 * If all slots are being used then only need to compare
+	 * against the count in the 0th slot, since it contains the
+	 * smallest count.
+	 */
+	if (ctdb_db->statistics.num_hot_keys == MAX_HOT_KEYS &&
+	    count <= ctdb_db->hot_keys[0].count) {
+		return;
+	}
+
+	/* see if we already know this key */
+	for (i = 0; i < MAX_HOT_KEYS; i++) {
+		if (key.dsize != ctdb_db->hot_keys[i].key.dsize) {
+			continue;
+		}
+		if (memcmp(key.dptr, ctdb_db->hot_keys[i].key.dptr, key.dsize)) {
+			continue;
+		}
+		/* found an entry for this key */
+		if (count <= ctdb_db->hot_keys[i].count) {
+			return;
+		}
+		if (count >= (2 * ctdb_db->hot_keys[i].last_logged_count)) {
+			keystr = hex_encode_talloc(ctdb_db,
+						   (unsigned char *)key.dptr,
+						   key.dsize);
+			D_NOTICE("Updated hot key database=%s key=%s count=%u\n",
+				 ctdb_db->db_name,
+				 keystr ? keystr : "" ,
+				 count);
+			TALLOC_FREE(keystr);
+			ctdb_db->hot_keys[i].last_logged_count = count;
+		}
+		ctdb_db->hot_keys[i].count = count;
+		goto sort_keys;
+	}
+
+	/*
+	 * Copy the key before any slot is touched, so that an allocation
+	 * failure cannot leave an entry with a length but no key bytes.
+	 */
+	key_copy = talloc_memdup(ctdb_db, key.dptr, key.dsize);
+	if (key_copy == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate hot key for database %s\n",
+				 ctdb_db->db_name));
+		return;
+	}
+
+	if (ctdb_db->statistics.num_hot_keys < MAX_HOT_KEYS) {
+		id = ctdb_db->statistics.num_hot_keys;
+		ctdb_db->statistics.num_hot_keys++;
+	} else {
+		/* table full: replace the entry with the smallest count */
+		id = 0;
+	}
+
+	if (ctdb_db->hot_keys[id].key.dptr != NULL) {
+		talloc_free(ctdb_db->hot_keys[id].key.dptr);
+	}
+	ctdb_db->hot_keys[id].key.dsize = key.dsize;
+	ctdb_db->hot_keys[id].key.dptr = key_copy;
+	ctdb_db->hot_keys[id].count = count;
+
+	keystr = hex_encode_talloc(ctdb_db,
+				   (unsigned char *)key.dptr, key.dsize);
+	D_NOTICE("Added hot key database=%s key=%s count=%u\n",
+		 ctdb_db->db_name,
+		 keystr ? keystr : "" ,
+		 count);
+	talloc_free(keystr);
+	ctdb_db->hot_keys[id].last_logged_count = count;
+
+sort_keys:
+	qsort(&ctdb_db->hot_keys[0],
+	      ctdb_db->statistics.num_hot_keys,
+	      sizeof(struct ctdb_db_hot_key),
+	      hot_key_cmp);
+}
+
+/*
+ * Called when a CTDB_REQ_CALL packet comes in.
+ *
+ * Depending on the state of the record the request is:
+ *  - deferred (record pinned down, dmaster change pending, record lock
+ *    busy, or a read-only revoke in progress),
+ *  - redirected to another node when this node is not the dmaster and
+ *    hosts no delegations,
+ *  - answered with a read-only delegation,
+ *  - answered by migrating the record to the requesting node, or
+ *  - executed locally, with the result sent back in a CTDB_REPLY_CALL.
+ *
+ * Every path that returns after the record lock was obtained releases
+ * that lock again.
+ */
+void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_call_old *c = (struct ctdb_req_call_old *)hdr;
+	TDB_DATA data;
+	struct ctdb_reply_call_old *r;
+	int ret, len;
+	struct ctdb_ltdb_header header;
+	struct ctdb_call *call;
+	struct ctdb_db_context *ctdb_db;
+	int tmp_count, bucket;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " Failed ctdb_request_call. Transport is DOWN\n"));
+		return;
+	}
+
+
+	ctdb_db = find_ctdb_db(ctdb, c->db_id);
+	if (!ctdb_db) {
+		ctdb_send_error(ctdb, hdr, -1,
+				"Unknown database in request. db_id==0x%08x",
+				c->db_id);
+		return;
+	}
+
+	call = talloc(hdr, struct ctdb_call);
+	CTDB_NO_MEMORY_FATAL(ctdb, call);
+
+	/* key and call data point straight into the request packet */
+	call->call_id = c->callid;
+	call->key.dptr = c->data;
+	call->key.dsize = c->keylen;
+	call->call_data.dptr = c->data + c->keylen;
+	call->call_data.dsize = c->calldatalen;
+	call->reply_data.dptr = NULL;
+	call->reply_data.dsize = 0;
+
+
+	/* If this record is pinned down we should defer the
+	   request until the pindown times out
+	*/
+	if (ctdb_db_sticky(ctdb_db)) {
+		if (ctdb_defer_pinned_down_request(ctdb, ctdb_db, call->key, hdr) == 0) {
+			DEBUG(DEBUG_WARNING,
+			      ("Defer request for pinned down record in %s\n", ctdb_db->db_name));
+			talloc_free(call);
+			return;
+		}
+	}
+
+	if (dmaster_defer_add(ctdb_db, hdr, call->key) == 0) {
+		talloc_free(call);
+		return;
+	}
+
+	/* determine if we are the dmaster for this key. This also
+	   fetches the record data (if any), thus avoiding a 2nd fetch of the data
+	   if the call will be answered locally */
+
+	ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, call->key, &header, hdr, &data,
+					   ctdb_call_input_pkt, ctdb, false);
+	if (ret == -1) {
+		ctdb_send_error(ctdb, hdr, ret, "ltdb fetch failed in ctdb_request_call");
+		talloc_free(call);
+		return;
+	}
+	if (ret == -2) {
+		DEBUG(DEBUG_INFO,(__location__ " deferred ctdb_request_call\n"));
+		talloc_free(call);
+		return;
+	}
+
+	/* Don't do READONLY if we don't have a tracking database */
+	if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
+		c->flags &= ~CTDB_WANT_READONLY;
+	}
+
+	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+		header.flags &= ~CTDB_REC_RO_FLAGS;
+		CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+		CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+		}
+		/* and clear out the tracking data */
+		if (tdb_delete(ctdb_db->rottdb, call->key) != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+		}
+	}
+
+	/* if we are revoking, we must defer all other calls until the revoke
+	 * has completed.
+	 */
+	if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+		talloc_free(data.dptr);
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		talloc_free(call);
+		return;
+	}
+
+	/*
+	 * If we are not the dmaster and are not hosting any delegations,
+	 * then we redirect the request to the node that can answer it
+	 * (the lmaster or the dmaster).
+	 */
+	if ((header.dmaster != ctdb->pnn)
+	    && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) ) {
+		talloc_free(data.dptr);
+		ctdb_call_send_redirect(ctdb, ctdb_db, call->key, c, &header);
+
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		talloc_free(call);
+		return;
+	}
+
+	/*
+	 * A request that does not want read-only access, for a record that
+	 * has been handed out read-only: start revoking and park the
+	 * request until the revoke child has finished.
+	 */
+	if ( (!(c->flags & CTDB_WANT_READONLY))
+	     && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+		header.flags |= CTDB_REC_RO_REVOKING_READONLY;
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to start record revoke");
+		}
+		talloc_free(data.dptr);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		talloc_free(call);
+
+		return;
+	}
+
+	/* If this is the first request for delegation. bump rsn and set
+	 * the delegations flag
+	 */
+	if ((c->flags & CTDB_WANT_READONLY)
+	    && (c->callid == CTDB_FETCH_WITH_HEADER_FUNC)
+	    && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS))) {
+		header.rsn += 3;
+		header.flags |= CTDB_REC_RO_HAVE_DELEGATIONS;
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+	}
+	if ((c->flags & CTDB_WANT_READONLY)
+	    && ((unsigned int)call->call_id == CTDB_FETCH_WITH_HEADER_FUNC)) {
+		TDB_DATA tdata;
+
+		/* remember in the tracking db that the caller holds a copy */
+		tdata = tdb_fetch(ctdb_db->rottdb, call->key);
+		if (ctdb_trackingdb_add_pnn(ctdb, &tdata, c->hdr.srcnode) != 0) {
+			ctdb_fatal(ctdb, "Failed to add node to trackingdb");
+		}
+		if (tdb_store(ctdb_db->rottdb, call->key, tdata, TDB_REPLACE) != 0) {
+			ctdb_fatal(ctdb, "Failed to store trackingdb data");
+		}
+		free(tdata.dptr);
+
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		/* the reply carries the record header followed by the data */
+		len = offsetof(struct ctdb_reply_call_old, data) + data.dsize + sizeof(struct ctdb_ltdb_header);
+		r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len,
+					    struct ctdb_reply_call_old);
+		CTDB_NO_MEMORY_FATAL(ctdb, r);
+		r->hdr.destnode  = c->hdr.srcnode;
+		r->hdr.reqid     = c->hdr.reqid;
+		r->hdr.generation = ctdb_db->generation;
+		r->status        = 0;
+		r->datalen       = data.dsize + sizeof(struct ctdb_ltdb_header);
+		/* only the copy of the header sent to the caller is changed */
+		header.rsn     -= 2;
+		header.flags   |= CTDB_REC_RO_HAVE_READONLY;
+		header.flags   &= ~CTDB_REC_RO_HAVE_DELEGATIONS;
+		memcpy(&r->data[0], &header, sizeof(struct ctdb_ltdb_header));
+
+		if (data.dsize) {
+			memcpy(&r->data[sizeof(struct ctdb_ltdb_header)], data.dptr, data.dsize);
+		}
+
+		ctdb_queue_packet(ctdb, &r->hdr);
+		CTDB_INCREMENT_STAT(ctdb, total_ro_delegations);
+		CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_delegations);
+
+		talloc_free(r);
+		talloc_free(call);
+		return;
+	}
+
+	/* account the hop count in a bucket indexed by its bit length */
+	CTDB_UPDATE_STAT(ctdb, max_hop_count, c->hopcount);
+	tmp_count = c->hopcount;
+	bucket = 0;
+	while (tmp_count) {
+		tmp_count >>= 1;
+		bucket++;
+	}
+	if (bucket >= MAX_COUNT_BUCKETS) {
+		bucket = MAX_COUNT_BUCKETS - 1;
+	}
+	CTDB_INCREMENT_STAT(ctdb, hop_count_bucket[bucket]);
+	CTDB_INCREMENT_DB_STAT(ctdb_db, hop_count_bucket[bucket]);
+
+	/* If this database supports sticky records, then check if the
+	   hopcount is big. If it is it means the record is hot and we
+	   should make it sticky.
+	*/
+	if (ctdb_db_sticky(ctdb_db) &&
+	    c->hopcount >= ctdb->tunable.hopcount_make_sticky) {
+		ctdb_make_record_sticky(ctdb, ctdb_db, call->key);
+	}
+
+
+	/* Try if possible to migrate the record off to the caller node.
+	 * From the clients perspective a fetch of the data is just as
+	 * expensive as a migration.
+	 */
+	if (c->hdr.srcnode != ctdb->pnn) {
+		if (ctdb_db->persistent_state) {
+			/*
+			 * The key is binary data and not a NUL terminated
+			 * string, so log its hash rather than printing it
+			 * with %s.
+			 */
+			DEBUG(DEBUG_INFO, (__location__ " refusing migration"
+			      " of key %08x while transaction is active\n",
+			      ctdb_hash(&(call->key))));
+		} else {
+			DEBUG(DEBUG_DEBUG,("pnn %u starting migration of %08x to %u\n",
+				 ctdb->pnn, ctdb_hash(&(call->key)), c->hdr.srcnode));
+			ctdb_call_send_dmaster(ctdb_db, c, &header, &(call->key), &data);
+		}
+
+		/*
+		 * Both cases are done with the record, so both must drop
+		 * the data and the record lock taken above.
+		 */
+		talloc_free(data.dptr);
+
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+		talloc_free(call);
+		return;
+	}
+
+	/* the request came from this node: run the call locally */
+	ret = ctdb_call_local(ctdb_db, call, &header, hdr, &data, true);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_call_local failed\n"));
+		call->status = -1;
+	}
+
+	ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+	}
+
+	len = offsetof(struct ctdb_reply_call_old, data) + call->reply_data.dsize;
+	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len,
+				    struct ctdb_reply_call_old);
+	CTDB_NO_MEMORY_FATAL(ctdb, r);
+	r->hdr.destnode  = hdr->srcnode;
+	r->hdr.reqid     = hdr->reqid;
+	r->hdr.generation = ctdb_db->generation;
+	r->status        = call->status;
+	r->datalen       = call->reply_data.dsize;
+	if (call->reply_data.dsize) {
+		memcpy(&r->data[0], call->reply_data.dptr, call->reply_data.dsize);
+	}
+
+	ctdb_queue_packet(ctdb, &r->hdr);
+
+	talloc_free(r);
+	talloc_free(call);
+}
+
+/**
+ * called when a CTDB_REPLY_CALL packet comes in
+ *
+ * This packet comes in response to a CTDB_REQ_CALL request packet. It
+ * contains any reply data from the call.
+ *
+ * For a reply to CTDB_FETCH_WITH_HEADER_FUNC the data starts with a
+ * record header.  If that header is flagged CTDB_REC_RO_HAVE_READONLY
+ * and carries a newer rsn than the local copy, the record is stored
+ * locally before the reply is handed to the waiting call state.
+ */
+void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_call_old *c = (struct ctdb_reply_call_old *)hdr;
+	struct ctdb_call_state *state;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " reqid %u not found\n", hdr->reqid));
+		return;
+	}
+
+	if (hdr->reqid != state->reqid) {
+		/* we found a record but it was the wrong one */
+		DEBUG(DEBUG_ERR, ("Dropped orphaned call reply with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+
+	/* read only delegation processing */
+	/* If we got a FETCH_WITH_HEADER we should check if this is a ro
+	 * delegation since we may need to update the record header
+	 */
+	if (state->c->callid == CTDB_FETCH_WITH_HEADER_FUNC) {
+		struct ctdb_db_context *ctdb_db = state->ctdb_db;
+		struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)&c->data[0];
+		struct ctdb_ltdb_header oldheader;
+		TDB_DATA key, data, olddata;
+		int ret;
+
+		/*
+		 * Make sure the reply is long enough to hold a record
+		 * header before looking at any field of that header.
+		 */
+		if (c->datalen < sizeof(struct ctdb_ltdb_header)) {
+			DEBUG(DEBUG_ERR,(__location__ " Got FETCH_WITH_HEADER reply with too little data: %u bytes\n", (unsigned)c->datalen));
+			goto finished_ro;
+		}
+
+		if (!(header->flags & CTDB_REC_RO_HAVE_READONLY)) {
+			goto finished_ro;
+		}
+
+		key.dsize = state->c->keylen;
+		key.dptr  = state->c->data;
+		ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
+					     ctdb_call_input_pkt, ctdb, false);
+		if (ret == -2) {
+			/* lock is busy, the packet has been requeued */
+			return;
+		}
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_call\n"));
+			return;
+		}
+
+		ret = ctdb_ltdb_fetch(ctdb_db, key, &oldheader, state, &olddata);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR, ("Failed to fetch old record in ctdb_reply_call\n"));
+		} else if (header->rsn > oldheader.rsn) {
+			/* the delegated copy is newer than ours: store it */
+			data.dsize = c->datalen - sizeof(struct ctdb_ltdb_header);
+			data.dptr  = &c->data[sizeof(struct ctdb_ltdb_header)];
+			ret = ctdb_ltdb_store(ctdb_db, key, header, data);
+			if (ret != 0) {
+				DEBUG(DEBUG_ERR, ("Failed to store new record in ctdb_reply_call\n"));
+			}
+		}
+
+		ctdb_ltdb_unlock(ctdb_db, key);
+	}
+finished_ro:
+
+	/* reply_data points into the packet, so keep the packet with the state */
+	state->call->reply_data.dptr = c->data;
+	state->call->reply_data.dsize = c->datalen;
+	state->call->status = c->status;
+
+	talloc_steal(state, c);
+
+	state->state = CTDB_CALL_DONE;
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
+}
+
+
+/**
+ * called when a CTDB_REPLY_DMASTER packet comes in
+ *
+ * This packet comes in from the lmaster in response to a CTDB_REQ_CALL
+ * request packet. It means that the current dmaster wants to give us
+ * the dmaster role.
+ *
+ * The packet payload is the key, followed by the record data, optionally
+ * followed by a 32 bit word of record flags.  The lengths announced in
+ * the packet are checked against the packet length before any of these
+ * parts is used.
+ */
+void ctdb_reply_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_dmaster_old *c = (struct ctdb_reply_dmaster_old *)hdr;
+	const size_t fixed_len = offsetof(struct ctdb_reply_dmaster_old, data);
+	struct ctdb_db_context *ctdb_db;
+	TDB_DATA key, data;
+	uint32_t record_flags = 0;
+	size_t payload_len;
+	int ret;
+
+	ctdb_db = find_ctdb_db(ctdb, c->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_reply_dmaster\n", c->db_id));
+		return;
+	}
+
+	/*
+	 * Reject packets whose key and data do not fit in the payload.
+	 * The comparisons are written as subtractions from the packet
+	 * length so that they cannot overflow.
+	 */
+	if (c->hdr.length < fixed_len) {
+		DEBUG(DEBUG_ERR,("Short packet of length %u in ctdb_reply_dmaster\n",
+				 (unsigned)c->hdr.length));
+		return;
+	}
+	payload_len = c->hdr.length - fixed_len;
+	if (c->keylen > payload_len ||
+	    c->datalen > payload_len - c->keylen) {
+		DEBUG(DEBUG_ERR,("Invalid lengths in ctdb_reply_dmaster: "
+				 "keylen %u datalen %u packet length %u\n",
+				 (unsigned)c->keylen, (unsigned)c->datalen,
+				 (unsigned)c->hdr.length));
+		return;
+	}
+
+	key.dptr   = c->data;
+	key.dsize  = c->keylen;
+	data.dptr  = &c->data[key.dsize];
+	data.dsize = c->datalen;
+
+	/* the record flags are only present if the packet has room for them */
+	if (payload_len - key.dsize - data.dsize >= sizeof(record_flags)) {
+		memcpy(&record_flags, &c->data[c->keylen + c->datalen],
+		       sizeof(record_flags));
+	}
+
+	dmaster_defer_setup(ctdb_db, hdr, key);
+
+	ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
+				     ctdb_call_input_pkt, ctdb, false);
+	if (ret == -2) {
+		/* lock is busy, the packet has been requeued */
+		return;
+	}
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_dmaster\n"));
+		return;
+	}
+
+	ctdb_become_dmaster(ctdb_db, hdr, key, data, c->rsn, record_flags);
+}
+
+
+/*
+ * Called when a CTDB_REPLY_ERROR packet comes in.
+ *
+ * Finds the pending call for the reqid in the packet, marks it as failed
+ * with the message carried by the packet and runs its async callback,
+ * if one is set.
+ */
+void ctdb_reply_error(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_error_old *reply = (struct ctdb_reply_error_old *)hdr;
+	struct ctdb_call_state *pending;
+
+	pending = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state);
+	if (pending == NULL) {
+		DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_error\n",
+			 ctdb->pnn, hdr->reqid));
+		return;
+	}
+
+	if (pending->reqid != hdr->reqid) {
+		/* the lookup gave us a state that belongs to another request */
+		DEBUG(DEBUG_ERR, ("Dropped orphaned error reply with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+	/* errmsg points into the packet, so the state takes the packet over */
+	pending->errmsg = (char *)reply->msg;
+	talloc_steal(pending, reply);
+	pending->state = CTDB_CALL_ERROR;
+
+	if (pending->async.fn != NULL) {
+		pending->async.fn(pending);
+	}
+}
+
+
+/*
+ * talloc destructor for a ctdb_call_state: takes the state off the
+ * database's list of pending calls and releases its reqid.
+ */
+static int ctdb_call_destructor(struct ctdb_call_state *state)
+{
+	struct ctdb_db_context *owner_db = state->ctdb_db;
+
+	DLIST_REMOVE(owner_db->pending_calls, state);
+	reqid_remove(owner_db->ctdb->idr, state->reqid);
+
+	return 0;
+}
+
+
+/*
+ * Called when a ctdb_call needs to be resent after a reconfigure event.
+ *
+ * The request gets a fresh reqid and the database's current generation
+ * and is queued to the local node.
+ */
+static void ctdb_call_resend(struct ctdb_call_state *state)
+{
+	struct ctdb_db_context *db = state->ctdb_db;
+	struct ctdb_context *ctdb = db->ctdb;
+	struct ctdb_req_call_old *req = state->c;
+
+	state->generation = db->generation;
+
+	/* use a new reqid, in case the old reply does eventually come in */
+	reqid_remove(ctdb->idr, state->reqid);
+	state->reqid = reqid_new(ctdb->idr, state);
+
+	req->hdr.reqid = state->reqid;
+	/* the request must carry the current generation to be valid with the new vnn_map */
+	req->hdr.generation = state->generation;
+	/* send the packet to ourselves, it will be redirected appropriately */
+	req->hdr.destnode = ctdb->pnn;
+
+	ctdb_queue_packet(ctdb, &req->hdr);
+	D_INFO("resent ctdb_call for db %s reqid %u generation %u\n",
+	       db->db_name,
+	       state->reqid,
+	       state->generation);
+}
+
+/*
+ * Resend all pending calls of one database on recovery and log how many
+ * were resent.
+ */
+void ctdb_call_resend_db(struct ctdb_db_context *ctdb_db)
+{
+	struct ctdb_call_state *cur = ctdb_db->pending_calls;
+	unsigned int resent = 0;
+
+	while (cur != NULL) {
+		/* pick up the successor before the entry is resent */
+		struct ctdb_call_state *following = cur->next;
+
+		ctdb_call_resend(cur);
+		resent++;
+		cur = following;
+	}
+
+	D_NOTICE("Resent calls for database=%s, generation=%u, count=%u\n",
+		 ctdb_db->db_name,
+		 ctdb_db->generation,
+		 resent);
+}
+
+/*
+ * Resend the pending calls of every database on the context's db list.
+ */
+void ctdb_call_resend_all(struct ctdb_context *ctdb)
+{
+	struct ctdb_db_context *db = ctdb->db_list;
+
+	while (db != NULL) {
+		ctdb_call_resend_db(db);
+		db = db->next;
+	}
+}
+
+/*
+ * Timer callback used for locally processed calls.  Running the async
+ * callback from a timer allows the caller to set up async.fn after the
+ * call state has been handed back to it.
+ */
+static void call_local_trigger(struct tevent_context *ev,
+			       struct tevent_timer *te,
+			       struct timeval t, void *private_data)
+{
+	struct ctdb_call_state *call_state;
+
+	call_state = talloc_get_type(private_data, struct ctdb_call_state);
+	if (!call_state->async.fn) {
+		return;
+	}
+
+	call_state->async.fn(call_state);
+}
+
+
+/*
+ * Construct an event driven local ctdb_call.
+ *
+ * The call is executed right away, but its completion is reported
+ * through a zero-delay timer, so that locally processed ctdb_call
+ * requests are handled in the same event driven manner as remote ones.
+ * The record data is re-parented onto the returned state.
+ */
+struct ctdb_call_state *ctdb_call_local_send(struct ctdb_db_context *ctdb_db,
+					     struct ctdb_call *call,
+					     struct ctdb_ltdb_header *header,
+					     TDB_DATA *data)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_call_state *state;
+	int rc;
+
+	state = talloc_zero(ctdb_db, struct ctdb_call_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+
+	state->ctdb_db = ctdb_db;
+	state->state = CTDB_CALL_DONE;
+
+	talloc_steal(state, data->dptr);
+
+	/* work on a private copy of the caller's call structure */
+	state->call = talloc(state, struct ctdb_call);
+	CTDB_NO_MEMORY_NULL(ctdb, state->call);
+	*(state->call) = *call;
+
+	rc = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
+	if (rc != 0) {
+		DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", rc));
+	}
+
+	/* fire call_local_trigger() from the next run of the event loop */
+	tevent_add_timer(ctdb->ev, state, timeval_zero(),
+			 call_local_trigger, state);
+
+	return state;
+}
+
+
+/*
+ * Make a remote ctdb call - async send.  Called in daemon context.
+ *
+ * Builds a CTDB_REQ_CALL packet addressed to the record's dmaster, adds
+ * the call state to the database's list of pending calls and queues the
+ * packet.  This call never blocks.  Returns NULL when the transport is
+ * down or memory cannot be allocated.
+ */
+struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctdb_db,
+						     struct ctdb_call *call,
+						     struct ctdb_ltdb_header *header)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_call_state *state;
+	struct ctdb_req_call_old *c;
+	uint32_t pkt_len;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " Failed send packet. Transport is down\n"));
+		return NULL;
+	}
+
+	state = talloc_zero(ctdb_db, struct ctdb_call_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+	state->call = talloc(state, struct ctdb_call);
+	CTDB_NO_MEMORY_NULL(ctdb, state->call);
+
+	state->ctdb_db    = ctdb_db;
+	state->generation = ctdb_db->generation;
+	state->state      = CTDB_CALL_WAIT;
+	state->reqid      = reqid_new(ctdb->idr, state);
+
+	/* the packet payload is the key followed by the call data */
+	pkt_len = offsetof(struct ctdb_req_call_old, data) + call->key.dsize +
+		call->call_data.dsize;
+
+	c = ctdb_transport_allocate(ctdb,
+				    state,
+				    CTDB_REQ_CALL,
+				    pkt_len,
+				    struct ctdb_req_call_old);
+
+	CTDB_NO_MEMORY_NULL(ctdb, c);
+	state->c = c;
+
+	c->hdr.destnode   = header->dmaster;
+	c->hdr.reqid      = state->reqid;
+	c->hdr.generation = ctdb_db->generation;
+	c->db_id          = ctdb_db->db_id;
+	c->callid         = call->call_id;
+	c->flags          = call->flags;
+	c->hopcount       = 0;
+	c->keylen         = call->key.dsize;
+	c->calldatalen    = call->call_data.dsize;
+
+	memcpy(&c->data[0], call->key.dptr, call->key.dsize);
+	memcpy(&c->data[call->key.dsize],
+	       call->call_data.dptr,
+	       call->call_data.dsize);
+
+	/* the saved call refers to the copies held inside the packet */
+	*(state->call) = *call;
+	state->call->key.dptr = &c->data[0];
+	state->call->call_data.dptr = &c->data[call->key.dsize];
+
+	DLIST_ADD(ctdb_db->pending_calls, state);
+
+	talloc_set_destructor(state, ctdb_call_destructor);
+	ctdb_queue_packet(ctdb, &state->c->hdr);
+
+	return state;
+}
+
+/*
+ * Make a remote ctdb call - async recv - called in daemon context.
+ *
+ * This is called when the program wants to wait for a ctdb_call to
+ * complete and get the results.  This call will block, running the
+ * event loop, unless the call has already completed.
+ *
+ * On success the reply data is copied onto "call", the call status is
+ * stored in call->status and 0 is returned.  If the call failed, or the
+ * reply data cannot be copied, the ctdb error string is set and -1 is
+ * returned.  "state" is freed in all cases.
+ */
+int ctdb_daemon_call_recv(struct ctdb_call_state *state, struct ctdb_call *call)
+{
+	struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+	int ret = -1;
+
+	while (state->state < CTDB_CALL_DONE) {
+		tevent_loop_once(ctdb->ev);
+	}
+	if (state->state != CTDB_CALL_DONE) {
+		/* never pass a NULL string to a %s conversion */
+		ctdb_set_error(ctdb, "%s",
+			       state->errmsg ? state->errmsg : "unknown error");
+		goto done;
+	}
+
+	call->reply_data.dptr = NULL;
+	call->reply_data.dsize = 0;
+
+	if (state->call->reply_data.dsize) {
+		call->reply_data.dptr = talloc_memdup(call,
+						      state->call->reply_data.dptr,
+						      state->call->reply_data.dsize);
+		if (call->reply_data.dptr == NULL) {
+			/*
+			 * Do not report success with a NULL pointer and a
+			 * non-zero size.
+			 */
+			ctdb_set_error(ctdb, "%s",
+				       "Out of memory copying call reply data");
+			goto done;
+		}
+		call->reply_data.dsize = state->call->reply_data.dsize;
+	}
+	call->status = state->call->status;
+	ret = 0;
+
+done:
+	talloc_free(state);
+	return ret;
+}
+
+
+/*
+ A request that is parked while a revoke child is working on its record.
+ Created by ctdb_add_revoke_deferred_call() and requeued from
+ deferred_call_requeue().
+*/
+struct revokechild_deferred_call {
+ /* doubly linked list links */
+ struct revokechild_deferred_call *prev, *next;
+ struct ctdb_context *ctdb;
+ /* the parked request packet; re-parented onto this structure */
+ struct ctdb_req_header *hdr;
+ /* invoked as fn(ctx, hdr) when the request is requeued */
+ deferred_requeue_fn fn;
+ void *ctx;
+ /* handle on whose deferred_call_list this entry was added; the destructor unlinks it from there */
+ struct revokechild_handle *rev_hdl;
+};
+
+/*
+ State kept in the daemon for one running revoke child process.
+ Entries are linked on ctdb_db->revokechild_active.
+*/
+struct revokechild_handle {
+ /* doubly linked list links */
+ struct revokechild_handle *next, *prev;
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db;
+ /* fd event watching fd[0] for the status byte written by the child */
+ struct tevent_fd *fde;
+ /* 0 initially; set to -1 when the child reports failure or its status cannot be read */
+ int status;
+ /* pipe from the child: the daemon reads fd[0], the child writes fd[1]; -1 when not open */
+ int fd[2];
+ /* pid of the revoke child */
+ pid_t child;
+ /* private copy of the key of the record being revoked */
+ TDB_DATA key;
+ /* requests parked until this revoke has finished */
+ struct revokechild_deferred_call *deferred_call_list;
+};
+
+/*
+ * Timer callback that requeues a list of deferred calls.
+ *
+ * private_data is the head of the list.  Each entry is unlinked, handed
+ * to its requeue function and then freed.
+ */
+static void deferred_call_requeue(struct tevent_context *ev,
+				  struct tevent_timer *te,
+				  struct timeval t, void *private_data)
+{
+	struct revokechild_deferred_call *head = talloc_get_type_abort(
+		private_data, struct revokechild_deferred_call);
+	struct revokechild_deferred_call *entry;
+
+	for (entry = head; entry != NULL; entry = head) {
+		/*
+		 * Clear the destructor first, so that freeing the entry
+		 * does not go through deferred_call_destructor().
+		 */
+		talloc_set_destructor(entry, NULL);
+		/* DLIST_REMOVE moves "head" on to the next entry */
+		DLIST_REMOVE(head, entry);
+		entry->fn(entry->ctx, entry->hdr);
+		talloc_free(entry);
+	}
+}
+
+/*
+ * talloc destructor for a deferred call: unlinks it from the deferred
+ * call list of its revoke handle.
+ */
+static int deferred_call_destructor(struct revokechild_deferred_call *dcall)
+{
+	DLIST_REMOVE(dcall->rev_hdl->deferred_call_list, dcall);
+
+	return 0;
+}
+
+/*
+ * talloc destructor for a revoke child handle.
+ *
+ * Releases the fd event and the pipe, kills the child, takes the handle
+ * off the database's list of active revoke children and schedules all
+ * deferred calls to be requeued.
+ */
+static int revokechild_destructor(struct revokechild_handle *rev_hdl)
+{
+	struct revokechild_deferred_call *run_now = NULL;
+	struct revokechild_deferred_call *run_later = NULL;
+	struct revokechild_deferred_call *dcall;
+
+	if (rev_hdl->fde != NULL) {
+		talloc_free(rev_hdl->fde);
+	}
+
+	if (rev_hdl->fd[0] != -1) {
+		close(rev_hdl->fd[0]);
+	}
+	if (rev_hdl->fd[1] != -1) {
+		close(rev_hdl->fd[1]);
+	}
+	ctdb_kill(rev_hdl->ctdb, rev_hdl->child, SIGKILL);
+
+	DLIST_REMOVE(rev_hdl->ctdb_db->revokechild_active, rev_hdl);
+
+	/* If revoke is successful, then first process all the calls
+	 * that need write access, and delay readonly requests by 1
+	 * second grace.
+	 *
+	 * If revoke is unsuccessful, most likely because of node
+	 * failure, delay all the pending requests, so database can
+	 * be recovered.
+	 */
+	while ((dcall = rev_hdl->deferred_call_list) != NULL) {
+		bool delay = true;
+
+		DLIST_REMOVE(rev_hdl->deferred_call_list, dcall);
+
+		if (rev_hdl->status == 0) {
+			struct ctdb_req_call_old *req =
+				(struct ctdb_req_call_old *)dcall->hdr;
+
+			delay = (req->flags & CTDB_WANT_READONLY) != 0;
+		}
+
+		if (delay) {
+			DLIST_ADD(run_later, dcall);
+		} else {
+			DLIST_ADD(run_now, dcall);
+		}
+	}
+
+	if (run_now != NULL) {
+		tevent_add_timer(rev_hdl->ctdb->ev,
+				 rev_hdl->ctdb_db,
+				 tevent_timeval_current_ofs(0, 0),
+				 deferred_call_requeue,
+				 run_now);
+	}
+
+	if (run_later != NULL) {
+		tevent_add_timer(rev_hdl->ctdb->ev,
+				 rev_hdl->ctdb_db,
+				 tevent_timeval_current_ofs(1, 0),
+				 deferred_call_requeue,
+				 run_later);
+	}
+
+	return 0;
+}
+
+/*
+ * fd event handler for the pipe from a revoke child.
+ *
+ * Reads the one byte status written by the child, records a failure in
+ * the handle if the read fails or the byte is non-zero, and then frees
+ * the handle.
+ */
+static void revokechild_handler(struct tevent_context *ev,
+				struct tevent_fd *fde,
+				uint16_t flags, void *private_data)
+{
+	struct revokechild_handle *rev_hdl =
+		talloc_get_type(private_data, struct revokechild_handle);
+	char child_status;
+	int nread;
+
+	nread = sys_read(rev_hdl->fd[0], &child_status, 1);
+	if (nread != 1) {
+		DEBUG(DEBUG_ERR,("Failed to read status from revokechild. errno:%d\n", errno));
+		rev_hdl->status = -1;
+	} else if (child_status != 0) {
+		DEBUG(DEBUG_ERR,("revokechild returned failure. status:%d\n", child_status));
+		rev_hdl->status = -1;
+	}
+
+	/* the destructor of the handle requeues the deferred calls */
+	talloc_free(rev_hdl);
+}
+
+/*
+ Bookkeeping for one run of ctdb_revoke_all_delegations()
+*/
+struct ctdb_revoke_state {
+ struct ctdb_db_context *ctdb_db;
+ /* key, header and data passed with every update record control */
+ TDB_DATA key;
+ struct ctdb_ltdb_header *header;
+ TDB_DATA data;
+ /* number of update record controls sent and not yet answered */
+ int count;
+ /* 0 until a control fails or the revoke times out, then -1 */
+ int status;
+ /* set to 1 once all controls have been answered or the timeout has fired */
+ int finished;
+};
+
+/*
+ * Completion callback for one update record control sent while revoking
+ * delegations.  Records a failure in the revoke state and marks the
+ * revoke as finished once no controls are outstanding.
+ */
+static void update_record_cb(struct ctdb_client_control_state *state)
+{
+	struct ctdb_revoke_state *rs;
+	int32_t res;
+	int ret;
+
+	if (state == NULL) {
+		return;
+	}
+
+	/* fetch our state before "state" is handed to ctdb_control_recv() */
+	rs = state->async.private_data;
+	state->async.fn = NULL;
+
+	ret = ctdb_control_recv(state->ctdb, state, state, NULL, &res, NULL);
+	if (!(ret == 0 && res == 0)) {
+		DEBUG(DEBUG_ERR,("Recv for revoke update record failed ret:%d res:%d\n", ret, res));
+		rs->status = -1;
+	}
+
+	rs->count -= 1;
+	if (rs->count <= 0) {
+		rs->finished = 1;
+	}
+}
+
+/*
+ * Called for each node passed by ctdb_trackingdb_traverse(): sends an
+ * update record control to node "pnn" and counts it as outstanding.
+ */
+static void revoke_send_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data)
+{
+	struct ctdb_revoke_state *rs = private_data;
+	struct ctdb_client_control_state *ctl;
+
+	ctl = ctdb_ctrl_updaterecord_send(
+		ctdb,
+		rs,
+		timeval_current_ofs(ctdb->tunable.control_timeout,0),
+		pnn,
+		rs->ctdb_db,
+		rs->key,
+		rs->header,
+		rs->data);
+	if (ctl == NULL) {
+		DEBUG(DEBUG_ERR,("Failure to send update record to revoke readonly delegation\n"));
+		rs->status = -1;
+		return;
+	}
+
+	ctl->async.private_data = rs;
+	ctl->async.fn = update_record_cb;
+
+	rs->count += 1;
+}
+
+/*
+ * Timer callback: the revoke took too long, so mark it as finished and
+ * failed.
+ */
+static void ctdb_revoke_timeout_handler(struct tevent_context *ev,
+					struct tevent_timer *te,
+					struct timeval yt, void *private_data)
+{
+	struct ctdb_revoke_state *rs = private_data;
+
+	DEBUG(DEBUG_ERR,("Timed out waiting for revoke to finish\n"));
+
+	rs->status = -1;
+	rs->finished = 1;
+}
+
+/*
+ * Revoke all read-only delegations of a record.  Called from the revoke
+ * child started by ctdb_start_revoke_ro_record().
+ *
+ * An update record control is sent to every node passed by
+ * ctdb_trackingdb_traverse() for "tdata", and the event loop is run until
+ * all of them have been answered or the control timeout has expired.
+ * The local record is then checked and rewritten: with
+ * CTDB_REC_RO_REVOKE_COMPLETE set if every control succeeded, otherwise
+ * with CTDB_REC_RO_REVOKING_READONLY cleared so the revoke is retried.
+ *
+ * Returns 0 if the local record was rewritten, -1 on any error.
+ */
+static int ctdb_revoke_all_delegations(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA tdata, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	const uint32_t revoking_flags =
+		CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS;
+	struct ctdb_revoke_state *state;
+	struct ctdb_ltdb_header new_header;
+	struct tevent_timer *te;
+	TDB_DATA new_data;
+	int ret = -1;
+
+	state = talloc_zero(ctdb, struct ctdb_revoke_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate revoke state in revokechild\n"));
+		return -1;
+	}
+
+	state->ctdb_db = ctdb_db;
+	state->key = key;
+	state->header = header;
+	state->data = data;
+
+	ctdb_trackingdb_traverse(ctdb, tdata, revoke_send_cb, state);
+
+	/* without this timer the wait below would have no upper bound */
+	te = tevent_add_timer(ctdb->ev, state,
+			      timeval_current_ofs(ctdb->tunable.control_timeout, 0),
+			      ctdb_revoke_timeout_handler, state);
+	if (te == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to set up revoke timeout in revokechild\n"));
+		goto done;
+	}
+
+	while (state->finished == 0) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	if (ctdb_ltdb_lock(ctdb_db, key) != 0) {
+		DEBUG(DEBUG_ERR,("Failed to chainlock the database in revokechild\n"));
+		goto done;
+	}
+	if (ctdb_ltdb_fetch(ctdb_db, key, &new_header, state, &new_data) != 0) {
+		DEBUG(DEBUG_ERR,("Failed for fetch tdb record in revokechild\n"));
+		goto unlock;
+	}
+	header->rsn++;
+	if (new_header.rsn > header->rsn) {
+		DEBUG(DEBUG_ERR,("RSN too high in tdb record in revokechild\n"));
+		goto unlock;
+	}
+	/* both flags must still be set on the stored record */
+	if ((new_header.flags & revoking_flags) != revoking_flags) {
+		DEBUG(DEBUG_ERR,("Flags are wrong in tdb record in revokechild\n"));
+		goto unlock;
+	}
+
+	/*
+	 * If revoke on all nodes succeed, revoke is complete. Otherwise,
+	 * remove CTDB_REC_RO_REVOKING_READONLY flag and retry.
+	 */
+	if (state->status == 0) {
+		new_header.rsn++;
+		new_header.flags |= CTDB_REC_RO_REVOKE_COMPLETE;
+	} else {
+		DEBUG(DEBUG_NOTICE, ("Revoke all delegations failed, retrying.\n"));
+		new_header.flags &= ~CTDB_REC_RO_REVOKING_READONLY;
+	}
+	if (ctdb_ltdb_store(ctdb_db, key, &new_header, new_data) != 0) {
+		DEBUG(DEBUG_ERR,("Failed to write new record in revokechild\n"));
+		goto unlock;
+	}
+	ret = 0;
+
+unlock:
+	ctdb_ltdb_unlock(ctdb_db, key);
+done:
+	talloc_free(state);
+	return ret;
+}
+
+
+/*
+ * Start revoking the read-only delegations of a record.
+ *
+ * Forks a child that runs ctdb_revoke_all_delegations() and reports the
+ * result as a single byte on a pipe.  The daemon watches the read end of
+ * that pipe with an fd event and keeps a revokechild_handle for the
+ * child on ctdb_db->revokechild_active.
+ *
+ * "header" is modified in place before the child is started: the
+ * read-only flags are cleared, CTDB_REC_FLAG_MIGRATED_WITH_DATA is set
+ * and the rsn is decremented.
+ *
+ * Returns 0 once the child is running and registered, -1 on failure.
+ * On failure nothing is left registered and the pipe is closed.
+ */
+int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb,
+				struct ctdb_db_context *ctdb_db,
+				TDB_DATA key,
+				struct ctdb_ltdb_header *header,
+				TDB_DATA data)
+{
+	TDB_DATA tdata;
+	struct revokechild_handle *rev_hdl;
+	pid_t parent = getpid();
+	int ret;
+
+	header->flags &= ~(CTDB_REC_RO_REVOKING_READONLY |
+			   CTDB_REC_RO_HAVE_DELEGATIONS |
+			   CTDB_REC_RO_HAVE_READONLY);
+
+	header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+	header->rsn   -= 1;
+
+	rev_hdl = talloc_zero(ctdb_db, struct revokechild_handle);
+	if (rev_hdl == NULL) {
+		D_ERR("Failed to allocate revokechild_handle\n");
+		return -1;
+	}
+
+	/*
+	 * Mark both pipe ends as closed before anything can jump to
+	 * err_out, which closes every end that is not -1.
+	 */
+	rev_hdl->status  = 0;
+	rev_hdl->ctdb    = ctdb;
+	rev_hdl->ctdb_db = ctdb_db;
+	rev_hdl->fd[0]   = -1;
+	rev_hdl->fd[1]   = -1;
+
+	/* move the tracking data from malloc()ed memory onto the handle */
+	tdata = tdb_fetch(ctdb_db->rottdb, key);
+	if (tdata.dsize > 0) {
+		uint8_t *tmp = tdata.dptr;
+
+		tdata.dptr = talloc_memdup(rev_hdl, tmp, tdata.dsize);
+		free(tmp);
+		if (tdata.dptr == NULL) {
+			D_ERR("Failed to allocate tracking data for "
+			      "revokechild_handle\n");
+			goto err_out;
+		}
+	}
+
+	rev_hdl->key.dsize = key.dsize;
+	rev_hdl->key.dptr = talloc_memdup(rev_hdl, key.dptr, key.dsize);
+	if (rev_hdl->key.dptr == NULL) {
+		D_ERR("Failed to allocate key for revokechild_handle\n");
+		goto err_out;
+	}
+
+	ret = pipe(rev_hdl->fd);
+	if (ret != 0) {
+		D_ERR("Failed to create pipe for revokechild_handle\n");
+		goto err_out;
+	}
+
+
+	rev_hdl->child = ctdb_fork(ctdb);
+	if (rev_hdl->child == (pid_t)-1) {
+		D_ERR("Failed to fork child for revokechild\n");
+		goto err_out;
+	}
+
+	if (rev_hdl->child == 0) {
+		/* child: do the revoke and report one status byte */
+		char c = 0;
+		close(rev_hdl->fd[0]);
+
+		prctl_set_comment("ctdb_revokechild");
+		if (switch_from_server_to_client(ctdb) != 0) {
+			D_ERR("Failed to switch from server to client "
+			      "for revokechild process\n");
+			c = 1;
+			goto child_finished;
+		}
+
+		c = ctdb_revoke_all_delegations(ctdb,
+						ctdb_db,
+						tdata,
+						key,
+						header,
+						data);
+
+child_finished:
+		sys_write(rev_hdl->fd[1], &c, 1);
+		ctdb_wait_for_process_to_exit(parent);
+		_exit(0);
+	}
+
+	/* parent: only the read end is needed */
+	close(rev_hdl->fd[1]);
+	rev_hdl->fd[1] = -1;
+	set_close_on_exec(rev_hdl->fd[0]);
+
+	rev_hdl->fde = tevent_add_fd(ctdb->ev,
+				     rev_hdl,
+				     rev_hdl->fd[0],
+				     TEVENT_FD_READ,
+				     revokechild_handler,
+				     (void *)rev_hdl);
+
+	if (rev_hdl->fde == NULL) {
+		D_ERR("Failed to set up fd event for revokechild process\n");
+		/*
+		 * Nobody would ever read the child's status, so stop the
+		 * child and fail, rather than carrying on with a handle
+		 * that is about to be freed.
+		 */
+		ctdb_kill(ctdb, rev_hdl->child, SIGKILL);
+		goto err_out;
+	}
+	tevent_fd_set_auto_close(rev_hdl->fde);
+
+	/* This is an active revokechild child process */
+	DLIST_ADD_END(ctdb_db->revokechild_active, rev_hdl);
+	talloc_set_destructor(rev_hdl, revokechild_destructor);
+
+	return 0;
+
+err_out:
+	/* no destructor is set on this path, so close the pipe by hand */
+	if (rev_hdl->fd[0] != -1) {
+		close(rev_hdl->fd[0]);
+	}
+	if (rev_hdl->fd[1] != -1) {
+		close(rev_hdl->fd[1]);
+	}
+	talloc_free(rev_hdl);
+	return -1;
+}
+
+/*
+ * Park a request until the revoke child working on record "key" has
+ * finished.
+ *
+ * The request packet is re-parented onto a new deferred call entry that
+ * is added to the list of the matching revoke handle.  Returns 0 on
+ * success, -1 if no revoke is active for the key or the entry cannot be
+ * allocated.
+ */
+int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context)
+{
+	struct revokechild_handle *match = ctdb_db->revokechild_active;
+	struct revokechild_deferred_call *entry;
+
+	/* find the active revoke for this key; handles without a key never match */
+	while (match != NULL) {
+		if (match->key.dsize != 0 &&
+		    match->key.dsize == key.dsize &&
+		    memcmp(match->key.dptr, key.dptr, key.dsize) == 0) {
+			break;
+		}
+		match = match->next;
+	}
+
+	if (match == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to add deferred call to revoke list. revoke structure not found\n"));
+		return -1;
+	}
+
+	entry = talloc(call_context, struct revokechild_deferred_call);
+	if (entry == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate deferred call structure for revoking record\n"));
+		return -1;
+	}
+
+	entry->ctdb = ctdb;
+	entry->fn = fn;
+	entry->ctx = call_context;
+	entry->rev_hdl = match;
+	/* the entry takes the request packet over */
+	entry->hdr = talloc_steal(entry, hdr);
+
+	talloc_set_destructor(entry, deferred_call_destructor);
+
+	DLIST_ADD(match->deferred_call_list, entry);
+
+	return 0;
+}
+
+/*
+ * Handler registered with hash_count_init(): forwards the counter of a
+ * key, capped at INT_MAX, to the hot key statistics of the database.
+ */
+static void ctdb_migration_count_handler(TDB_DATA key, uint64_t counter,
+					 void *private_data)
+{
+	struct ctdb_db_context *db = talloc_get_type_abort(
+		private_data, struct ctdb_db_context);
+	unsigned int capped;
+
+	if (counter < INT_MAX) {
+		capped = counter;
+	} else {
+		capped = INT_MAX;
+	}
+
+	ctdb_update_db_stat_hot_keys(db, key, capped);
+}
+
+/*
+ * Periodic timer callback: expires entries of the migration counter
+ * database and schedules itself again in 10 seconds.  If it cannot be
+ * rescheduled, the migration counter database is freed.
+ */
+static void ctdb_migration_cleandb_event(struct tevent_context *ev,
+					 struct tevent_timer *te,
+					 struct timeval current_time,
+					 void *private_data)
+{
+	struct ctdb_db_context *db = talloc_get_type_abort(
+		private_data, struct ctdb_db_context);
+
+	if (db->migratedb == NULL) {
+		return;
+	}
+
+	hash_count_expire(db->migratedb, NULL);
+
+	/* the timer hangs off migratedb, so it goes away together with it */
+	te = tevent_add_timer(db->ctdb->ev, db->migratedb,
+			      tevent_timeval_current_ofs(10, 0),
+			      ctdb_migration_cleandb_event, db);
+	if (te != NULL) {
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Memory error in migration cleandb event for %s\n",
+	       db->db_name));
+	TALLOC_FREE(db->migratedb);
+}
+
+/*
+ * Set up migration counting for a volatile database: create the counter
+ * database and start the periodic clean-up timer.  Nothing is done for
+ * other databases.
+ *
+ * Returns 0 on success, -1 if the counter database or the timer cannot
+ * be set up.
+ */
+int ctdb_migration_init(struct ctdb_db_context *ctdb_db)
+{
+	struct timeval one_second = { .tv_sec = 1, .tv_usec = 0 };
+	struct tevent_timer *cleanup_timer;
+	int rc;
+
+	if (! ctdb_db_volatile(ctdb_db)) {
+		return 0;
+	}
+
+	rc = hash_count_init(ctdb_db, one_second,
+			     ctdb_migration_count_handler, ctdb_db,
+			     &ctdb_db->migratedb);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Memory error in migration init for %s\n",
+		       ctdb_db->db_name));
+		return -1;
+	}
+
+	/* first clean-up run in 10 seconds; the handler re-arms itself */
+	cleanup_timer = tevent_add_timer(ctdb_db->ctdb->ev, ctdb_db->migratedb,
+					 tevent_timeval_current_ofs(10, 0),
+					 ctdb_migration_cleandb_event, ctdb_db);
+	if (cleanup_timer == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Memory error in migration init for %s\n",
+		       ctdb_db->db_name));
+		TALLOC_FREE(ctdb_db->migratedb);
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_client.c b/ctdb/server/ctdb_client.c
new file mode 100644
index 0000000..c9edb1d
--- /dev/null
+++ b/ctdb/server/ctdb_client.c
@@ -0,0 +1,1709 @@
+/*
+ ctdb daemon code
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/locale.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/time.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+  allocate a packet for use in client<->daemon communication
+
+  The packet is a zeroed talloc child of mem_ctx, named 'type'.  Its
+  payload length is the larger of 'length' and 'slength'; the
+  allocation itself is rounded up to a multiple of CTDB_DS_ALIGNMENT.
+  The common header fields (length, operation, magic, version, source
+  node and - when a vnn map is present - generation) are filled in.
+
+  Returns NULL if the size computation overflows or the allocation
+  fails.
+ */
+struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb,
+					    TALLOC_CTX *mem_ctx,
+					    enum ctdb_operation operation,
+					    size_t length, size_t slength,
+					    const char *type)
+{
+	/* size_t, not int: the rounded size must not be truncated */
+	size_t size;
+	struct ctdb_req_header *hdr;
+
+	length = MAX(length, slength);
+	size = (length + (CTDB_DS_ALIGNMENT-1)) &
+		~((size_t)CTDB_DS_ALIGNMENT-1);
+	if (size < length) {
+		/* rounding up wrapped around */
+		DEBUG(DEBUG_ERR,("Unable to allocate packet for operation %u of length %u\n",
+			 operation, (unsigned)length));
+		return NULL;
+	}
+
+	hdr = (struct ctdb_req_header *)talloc_zero_size(mem_ctx, size);
+	if (hdr == NULL) {
+		DEBUG(DEBUG_ERR,("Unable to allocate packet for operation %u of length %u\n",
+			 operation, (unsigned)length));
+		return NULL;
+	}
+	talloc_set_name_const(hdr, type);
+	hdr->length       = length;
+	hdr->operation    = operation;
+	hdr->ctdb_magic   = CTDB_MAGIC;
+	hdr->ctdb_version = CTDB_PROTOCOL;
+	hdr->srcnode      = ctdb->pnn;
+	if (ctdb->vnn_map) {
+		hdr->generation = ctdb->vnn_map->generation;
+	}
+
+	return hdr;
+}
+
+/*
+ local version of ctdb_call
+
+ Finds the handler registered on ctdb_db whose id matches
+ call->call_id and runs it against a private copy of the record data.
+ When updatetdb is true the resulting record data is written back with
+ ctdb_ltdb_store().  Any reply data produced by the handler is moved
+ onto 'call' (as a talloc child of call), and call->status is set from
+ the handler's status.
+
+ Returns 0 on success, -1 on unknown call id, handler failure or store
+ failure (an error is recorded with ctdb_set_error() in those cases).
+*/
+int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
+ struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
+ TDB_DATA *data, bool updatetdb)
+{
+ struct ctdb_call_info *c;
+ struct ctdb_registered_call *fn;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+ /* scratch call-info; everything the handler allocates hangs off it */
+ c = talloc_zero(mem_ctx, struct ctdb_call_info);
+ CTDB_NO_MEMORY(ctdb, c);
+
+ c->key = call->key;
+ c->call_data = &call->call_data;
+ /* the handler works on a copy, not on the caller's buffer */
+ c->record_data.dptr = talloc_memdup(c, data->dptr, data->dsize);
+ c->record_data.dsize = data->dsize;
+ /* NOTE(review): presumably returns early on NULL without freeing c;
+    c stays owned by mem_ctx - confirm against the macro definition */
+ CTDB_NO_MEMORY(ctdb, c->record_data.dptr);
+ c->header = header;
+
+ /* linear search of the registered calls for this database */
+ for (fn=ctdb_db->calls;fn;fn=fn->next) {
+ if (fn->id == (uint32_t)call->call_id) {
+ break;
+ }
+ }
+ if (fn == NULL) {
+ ctdb_set_error(ctdb, "Unknown call id %u\n", call->call_id);
+ talloc_free(c);
+ return -1;
+ }
+
+ if (fn->fn(c) != 0) {
+ ctdb_set_error(ctdb, "ctdb_call %u failed\n", call->call_id);
+ talloc_free(c);
+ return -1;
+ }
+
+ /* we need to force the record to be written out if this was a remote access */
+ if (c->new_data == NULL) {
+ c->new_data = &c->record_data;
+ }
+
+ /* c->new_data is never NULL here, so this is effectively "if (updatetdb)" */
+ if (c->new_data && updatetdb) {
+ /* XXX check that we always have the lock here? */
+ if (ctdb_ltdb_store(ctdb_db, call->key, header, *c->new_data) != 0) {
+ ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n");
+ talloc_free(c);
+ return -1;
+ }
+ }
+
+ if (c->reply_data) {
+ call->reply_data = *c->reply_data;
+
+ /* reparent the reply buffer so it survives talloc_free(c) below */
+ talloc_steal(call, call->reply_data.dptr);
+ talloc_set_name_const(call->reply_data.dptr, __location__);
+ } else {
+ call->reply_data.dptr = NULL;
+ call->reply_data.dsize = 0;
+ }
+ call->status = c->status;
+
+ talloc_free(c);
+
+ return 0;
+}
+
+
+/*
+  Hand a fully built packet to the daemon queue.  hdr->length bytes
+  starting at hdr are passed to ctdb_queue_send(), whose result is
+  returned unchanged.
+*/
+static int ctdb_client_queue_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	uint8_t *wire = (uint8_t *)hdr;
+
+	return ctdb_queue_send(ctdb->daemon.queue, wire, hdr->length);
+}
+
+
+/*
+  Client-side handler for a CTDB_REPLY_CALL packet.
+
+  The packet answers an earlier CTDB_REQ_CALL.  The matching call state
+  is looked up by reqid; the reply payload and status are attached to
+  it, the packet is reparented to the state so the payload stays valid,
+  and the state is marked CTDB_CALL_DONE.  An async callback, if set,
+  is invoked directly.
+*/
+static void ctdb_client_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_call_old *reply = (struct ctdb_reply_call_old *)hdr;
+	struct ctdb_client_call_state *state;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_client_call_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
+		return;
+	}
+
+	if (state->reqid != hdr->reqid) {
+		/* the lookup hit a different request: ignore this reply */
+		DEBUG(DEBUG_ERR, ("Dropped client call reply with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+	state->call->status           = reply->status;
+	state->call->reply_data.dsize = reply->datalen;
+	state->call->reply_data.dptr  = reply->data;
+
+	/* reply_data points into the packet, so keep the packet alive */
+	talloc_steal(state, reply);
+
+	state->state = CTDB_CALL_DONE;
+
+	if (state->async.fn != NULL) {
+		state->async.fn(state);
+	}
+}
+
+/*
+  Handle a CTDB_REQ_MESSAGE packet: copy its payload (as a talloc child
+  of the packet) and dispatch it to the handlers registered for the
+  packet's srvid.
+*/
+void ctdb_request_message(struct ctdb_context *ctdb,
+			  struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_message_old *msg = (struct ctdb_req_message_old *)hdr;
+	TDB_DATA payload;
+
+	payload.dptr = talloc_memdup(msg, &msg->data[0], msg->datalen);
+	payload.dsize = msg->datalen;
+	if (payload.dptr == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Memory allocation failure\n"));
+		return;
+	}
+
+	srvid_dispatch(ctdb->srv, msg->srvid, CTDB_SRVID_ALL, payload);
+}
+
+/* defined further down; needed by the dispatcher below */
+static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+
+/*
+ this is called in the client, when data comes in from the daemon
+
+ data/cnt describe one packet; args is the ctdb context passed when the
+ queue was set up.  A zero-length read is treated as "daemon has
+ exited" and terminates the process with exit(1).  Otherwise the
+ header is validated (minimum size, length field matching cnt, magic,
+ protocol version) and the packet is dispatched on hdr->operation.
+ Packets failing validation or carrying an unknown operation are
+ dropped.
+ */
+void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+ struct ctdb_context *ctdb = talloc_get_type(args, struct ctdb_context);
+ struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
+ TALLOC_CTX *tmp_ctx;
+
+ /* place the packet as a child of a tmp_ctx. We then use
+ talloc_free() below to free it. If any of the calls want
+ to keep it, then they will steal it somewhere else, and the
+ talloc_free() will be a no-op */
+ tmp_ctx = talloc_new(ctdb);
+ talloc_steal(tmp_ctx, hdr);
+
+ if (cnt == 0) {
+ DEBUG(DEBUG_CRIT,("Daemon has exited - shutting down client\n"));
+ exit(1);
+ }
+
+ /* must hold at least a full header before any field is read */
+ if (cnt < sizeof(*hdr)) {
+ DEBUG(DEBUG_CRIT,("Bad packet length %u in client\n", (unsigned)cnt));
+ goto done;
+ }
+ if (cnt != hdr->length) {
+ ctdb_set_error(ctdb, "Bad header length %u expected %u in client\n",
+ (unsigned)hdr->length, (unsigned)cnt);
+ goto done;
+ }
+
+ if (hdr->ctdb_magic != CTDB_MAGIC) {
+ ctdb_set_error(ctdb, "Non CTDB packet rejected in client\n");
+ goto done;
+ }
+
+ if (hdr->ctdb_version != CTDB_PROTOCOL) {
+ ctdb_set_error(ctdb, "Bad CTDB version 0x%x rejected in client\n", hdr->ctdb_version);
+ goto done;
+ }
+
+ switch (hdr->operation) {
+ case CTDB_REPLY_CALL:
+ ctdb_client_reply_call(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_MESSAGE:
+ ctdb_request_message(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_CONTROL:
+ ctdb_client_reply_control(ctdb, hdr);
+ break;
+
+ default:
+ DEBUG(DEBUG_CRIT,("bogus operation code:%u\n",hdr->operation));
+ }
+
+done:
+ /* frees the packet too, unless a handler reparented it */
+ talloc_free(tmp_ctx);
+}
+
+/*
+  connect to a unix domain socket
+
+  Opens a stream socket to the path in ctdb->daemon.name, switches it
+  to non-blocking and close-on-exec, and attaches a packet queue whose
+  read callback is ctdb_client_read_cb().
+
+  Returns 0 on success.  On any failure returns -1 with the socket
+  closed and ctdb->daemon.sd reset to -1, so a later call can retry.
+*/
+int ctdb_socket_connect(struct ctdb_context *ctdb)
+{
+	struct sockaddr_un addr;
+	int ret;
+
+	/* zero-fill first so the bounded copy below is always terminated */
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
+
+	ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ctdb->daemon.sd == -1) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno));
+		return -1;
+	}
+
+	if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " Failed to connect client socket to daemon (%s)\n",
+		       strerror(errno)));
+		close(ctdb->daemon.sd);
+		ctdb->daemon.sd = -1;
+		return -1;
+	}
+
+	ret = set_blocking(ctdb->daemon.sd, false);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " failed to set socket non-blocking (%s)\n",
+		       strerror(errno)));
+		close(ctdb->daemon.sd);
+		ctdb->daemon.sd = -1;
+		return -1;
+	}
+
+	set_close_on_exec(ctdb->daemon.sd);
+
+	ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd,
+					      CTDB_DS_ALIGNMENT,
+					      ctdb_client_read_cb, ctdb, "to-ctdbd");
+	if (ctdb->daemon.queue == NULL) {
+		/* without a queue nothing can be sent: report failure
+		   instead of leaving a connected but unusable socket */
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " failed to set up queue to daemon\n"));
+		close(ctdb->daemon.sd);
+		ctdb->daemon.sd = -1;
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ Bundles a database context with one record: its key, a pointer to its
+ data and its ltdb header.
+ NOTE(review): no user of this struct is visible in this part of the
+ file - confirm how it is used before relying on field ownership.
+*/
+struct ctdb_record_handle {
+ struct ctdb_db_context *ctdb_db;
+ TDB_DATA key;
+ TDB_DATA *data;
+ struct ctdb_ltdb_header header;
+};
+
+
+/*
+  make a recv call to the local ctdb daemon - called from client context
+
+  This is called when the program wants to wait for a ctdb_call to
+  complete and get the results.  It runs the event loop until the call
+  has left the waiting state, so it blocks unless the call has already
+  completed.
+
+  On success the reply data is copied (as a talloc child of the
+  database context) into call->reply_data, call->status is set, and
+  call->status is returned.  Returns -1 if state is NULL, if the call
+  did not end in CTDB_CALL_DONE, or if the reply could not be copied.
+  'state' is always freed, except when it is NULL.
+*/
+int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call)
+{
+	if (state == NULL) {
+		return -1;
+	}
+
+	while (state->state < CTDB_CALL_DONE) {
+		tevent_loop_once(state->ctdb_db->ctdb->ev);
+	}
+	if (state->state != CTDB_CALL_DONE) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_call_recv failed\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	if (state->call->reply_data.dsize) {
+		call->reply_data.dptr = talloc_memdup(state->ctdb_db,
+						      state->call->reply_data.dptr,
+						      state->call->reply_data.dsize);
+		if (call->reply_data.dptr == NULL) {
+			/* never hand back a NULL pointer with a non-zero size */
+			DEBUG(DEBUG_ERR,(__location__ " failed to copy reply data\n"));
+			call->reply_data.dsize = 0;
+			talloc_free(state);
+			return -1;
+		}
+		call->reply_data.dsize = state->call->reply_data.dsize;
+	} else {
+		call->reply_data.dptr = NULL;
+		call->reply_data.dsize = 0;
+	}
+	call->status = state->call->status;
+	talloc_free(state);
+
+	return call->status;
+}
+
+
+
+
+/*
+  talloc destructor for a client call state: releases the request id
+  that was allocated for it.  Always allows the free to proceed.
+*/
+static int ctdb_client_call_destructor(struct ctdb_client_call_state *state)
+{
+	struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+
+	reqid_remove(ctdb->idr, state->reqid);
+	return 0;
+}
+
+/*
+  construct an event driven local ctdb_call
+
+  this is used so that locally processed ctdb_call requests are processed
+  in an event driven manner
+
+  The call is executed immediately through ctdb_call_local() and the
+  returned state is already in CTDB_CALL_DONE, so ctdb_call_recv() on it
+  will not block.  A failure of ctdb_call_local() is only logged.
+  Returns NULL if the state cannot be allocated.
+*/
+static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db_context *ctdb_db,
+								  struct ctdb_call *call,
+								  struct ctdb_ltdb_header *header,
+								  TDB_DATA *data)
+{
+	struct ctdb_client_call_state *state;
+	struct ctdb_call *call_copy;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	int ret;
+
+	state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+
+	call_copy = talloc_zero(state, struct ctdb_call);
+	if (call_copy == NULL) {
+		/* state hangs off the long-lived ctdb_db: don't leak it */
+		talloc_free(state);
+	}
+	CTDB_NO_MEMORY_NULL(ctdb, call_copy);
+	state->call = call_copy;
+
+	talloc_steal(state, data->dptr);
+
+	state->state = CTDB_CALL_DONE;
+	*(state->call) = *call;
+	state->ctdb_db = ctdb_db;
+
+	ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
+	if (ret != 0) {
+		DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret));
+	}
+
+	return state;
+}
+
+/*
+  make a ctdb call to the local daemon - async send. Called from client context.
+
+  This constructs a ctdb_call request and queues it for processing.
+  This call never blocks.
+
+  If the record can be fetched locally, this node is its dmaster and no
+  immediate migration is forced by read-only delegations, the call is
+  executed locally and a completed state is returned.  Otherwise a
+  CTDB_REQ_CALL packet is built and queued to the daemon and a state in
+  CTDB_CALL_WAIT is returned.
+
+  Returns NULL on failure (lock, allocation or queueing); no state is
+  left behind in that case.
+*/
+struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db,
+					      struct ctdb_call *call)
+{
+	struct ctdb_client_call_state *state;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_ltdb_header header;
+	/* start empty so the talloc_free() below is safe even if the
+	   fetch fails without touching it */
+	TDB_DATA data = tdb_null;
+	int ret;
+	size_t len;
+	struct ctdb_req_call_old *c;
+
+	/* if the domain socket is not yet open, open it */
+	if (ctdb->daemon.sd==-1) {
+		ctdb_socket_connect(ctdb);
+	}
+
+	ret = ctdb_ltdb_lock(ctdb_db, call->key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to get chainlock\n"));
+		return NULL;
+	}
+
+	ret = ctdb_ltdb_fetch(ctdb_db, call->key, &header, ctdb_db, &data);
+
+	/* only look at the header if the fetch actually produced one;
+	   a failed fetch already rules out the local path below */
+	if (ret == 0 &&
+	    (call->flags & CTDB_IMMEDIATE_MIGRATION) &&
+	    (header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+		ret = -1;
+	}
+
+	if (ret == 0 && header.dmaster == ctdb->pnn) {
+		state = ctdb_client_call_local_send(ctdb_db, call, &header, &data);
+		talloc_free(data.dptr);
+		ctdb_ltdb_unlock(ctdb_db, call->key);
+		return state;
+	}
+
+	ctdb_ltdb_unlock(ctdb_db, call->key);
+	talloc_free(data.dptr);
+
+	state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to allocate state\n"));
+		return NULL;
+	}
+	state->call = talloc_zero(state, struct ctdb_call);
+	if (state->call == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to allocate state->call\n"));
+		/* no destructor is set yet, so this is a plain free */
+		talloc_free(state);
+		return NULL;
+	}
+
+	len = offsetof(struct ctdb_req_call_old, data) + call->key.dsize + call->call_data.dsize;
+	c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CALL, len, struct ctdb_req_call_old);
+	if (c == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to allocate packet\n"));
+		talloc_free(state);
+		return NULL;
+	}
+
+	state->reqid     = reqid_new(ctdb->idr, state);
+	state->ctdb_db = ctdb_db;
+	talloc_set_destructor(state, ctdb_client_call_destructor);
+
+	c->hdr.reqid     = state->reqid;
+	c->flags         = call->flags;
+	c->db_id         = ctdb_db->db_id;
+	c->callid        = call->call_id;
+	c->hopcount      = 0;
+	c->keylen        = call->key.dsize;
+	c->calldatalen   = call->call_data.dsize;
+	/* wire layout of data[]: key bytes followed by call data bytes */
+	memcpy(&c->data[0], call->key.dptr, call->key.dsize);
+	memcpy(&c->data[call->key.dsize],
+	       call->call_data.dptr, call->call_data.dsize);
+	*(state->call)              = *call;
+	/* point the saved call at the packet copies, not the caller's buffers */
+	state->call->call_data.dptr = &c->data[call->key.dsize];
+	state->call->key.dptr       = &c->data[0];
+
+	state->state = CTDB_CALL_WAIT;
+
+	ret = ctdb_client_queue_pkt(ctdb, &c->hdr);
+	if (ret != 0) {
+		/* a request that was never sent can never complete; the
+		   destructor releases the reqid (same handling as in
+		   ctdb_control_send) */
+		DEBUG(DEBUG_ERR, (__location__ " failed to queue packet\n"));
+		talloc_free(state);
+		return NULL;
+	}
+
+	return state;
+}
+
+
+/*
+  Synchronous ctdb_call: issues the request with ctdb_call_send() and
+  waits for the outcome with ctdb_call_recv(), whose result is
+  returned.
+*/
+int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
+{
+	struct ctdb_client_call_state *pending = ctdb_call_send(ctdb_db, call);
+
+	return ctdb_call_recv(pending, call);
+}
+
+
+/*
+  Register interest in a messaging srvid.
+
+  First asks the daemon to register the srvid for us via
+  CTDB_CONTROL_REGISTER_SRVID, then records the handler in the
+  client's own srvid table.  Returns -1 if the control fails,
+  otherwise the result of srvid_register().
+*/
+int ctdb_client_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid,
+				    srvid_handler_fn handler,
+				    void *private_data)
+{
+	int32_t status;
+	int ret;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid,
+			   CTDB_CONTROL_REGISTER_SRVID, 0,
+			   tdb_null, NULL, NULL, &status, NULL, NULL);
+	if (ret == 0 && status == 0) {
+		/* daemon side done; now hook up the local dispatch entry */
+		return srvid_register(ctdb->srv, ctdb, srvid, handler,
+				      private_data);
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Failed to register srvid %llu\n",
+	       (unsigned long long)srvid));
+	return -1;
+}
+
+/*
+  Drop interest in a messaging srvid.
+
+  Asks the daemon to deregister the srvid via
+  CTDB_CONTROL_DEREGISTER_SRVID and, if that succeeds, removes the
+  handler from the client's own srvid table.  Returns 0 on success,
+  -1 if the control fails.
+*/
+int ctdb_client_remove_message_handler(struct ctdb_context *ctdb,
+				       uint64_t srvid, void *private_data)
+{
+	int32_t status;
+	int ret;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid,
+			   CTDB_CONTROL_DEREGISTER_SRVID, 0,
+			   tdb_null, NULL, NULL, &status, NULL, NULL);
+	if (ret == 0 && status == 0) {
+		/* also deregister the handler from our own ctdb structure */
+		srvid_deregister(ctdb->srv, srvid, private_data);
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Failed to deregister srvid %llu\n",
+	       (unsigned long long)srvid));
+	return -1;
+}
+
+/*
+  send a message - from client context
+
+  Builds a CTDB_REQ_MESSAGE packet addressed to node 'pnn' carrying
+  'data' for 'srvid', queues it to the daemon and frees the local
+  packet.  Returns the result of queueing, or the CTDB_NO_MEMORY
+  failure value if the packet cannot be allocated.
+ */
+int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+		      uint64_t srvid, TDB_DATA data)
+{
+	struct ctdb_req_message_old *r;
+	size_t len;
+	int res;
+
+	len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
+	r = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_MESSAGE,
+			       len, struct ctdb_req_message_old);
+	CTDB_NO_MEMORY(ctdb, r);
+
+	r->hdr.destnode  = pnn;
+	r->srvid         = srvid;
+	r->datalen       = data.dsize;
+	/* an empty message may come with a NULL dptr; memcpy() must not
+	   be handed that even for zero bytes (ctdb_control_send guards
+	   the same way) */
+	if (data.dsize > 0) {
+		memcpy(&r->data[0], data.dptr, data.dsize);
+	}
+
+	res = ctdb_client_queue_pkt(ctdb, &r->hdr);
+	talloc_free(r);
+	return res;
+}
+
+
+/*
+  Zero-delay timer handler scheduled when a control completes or times
+  out.  It runs ctdb_control_recv() on the state, which is where the
+  user supplied callback gets invoked, and then disposes of the state
+  through a temporary talloc context.
+*/
+static void invoke_control_callback(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval t, void *private_data)
+{
+	TALLOC_CTX *frame = talloc_new(NULL);
+	struct ctdb_client_control_state *state =
+		talloc_get_type(private_data, struct ctdb_client_control_state);
+	int rc;
+
+	talloc_steal(frame, state);
+
+	rc = ctdb_control_recv(state->ctdb, state, state,
+			       NULL, NULL, NULL);
+	if (rc != 0) {
+		DEBUG(DEBUG_DEBUG,("ctdb_control_recv() failed, ignoring return code %d\n", rc));
+	}
+
+	talloc_free(frame);
+}
+
+/*
+  Client-side handler for a CTDB_REPLY_CONTROL packet.
+
+  The packet answers an earlier CTDB_REQ_CONTROL.  The matching control
+  state is looked up by reqid, the reply payload, status and optional
+  error string are attached to it, and the state is marked
+  CTDB_CONTROL_DONE.  If an async callback is registered it is not
+  called here; a zero-delay timer is scheduled to deliver it.
+*/
+static void ctdb_client_reply_control(struct ctdb_context *ctdb,
+				      struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_control_old *reply = (struct ctdb_reply_control_old *)hdr;
+	struct ctdb_client_control_state *state;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_client_control_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
+		return;
+	}
+
+	if (state->reqid != hdr->reqid) {
+		/* the lookup hit a different request: ignore this reply */
+		DEBUG(DEBUG_ERR, ("Dropped orphaned reply control with reqid:%u\n",hdr->reqid));
+		return;
+	}
+
+	state->status        = reply->status;
+	state->outdata.dsize = reply->datalen;
+	state->outdata.dptr  = reply->data;
+	if (reply->errorlen != 0) {
+		/* the error text sits right behind the data bytes */
+		state->errormsg = talloc_strndup(state,
+						 (char *)&reply->data[reply->datalen],
+						 reply->errorlen);
+	}
+
+	/* outdata points into the packet, so the packet has to live as
+	   long as the state does */
+	talloc_steal(state, reply);
+
+	state->state = CTDB_CONTROL_DONE;
+
+	if (state->async.fn == NULL) {
+		return;
+	}
+
+	/* deliver the callback from the event loop rather than from here */
+	tevent_add_timer(ctdb->ev, state, timeval_zero(),
+			 invoke_control_callback, state);
+}
+
+
+/*
+  talloc destructor for a client control state: releases the request
+  id that was allocated for it.  Always allows the free to proceed.
+*/
+static int ctdb_client_control_destructor(struct ctdb_client_control_state *state)
+{
+	struct ctdb_context *ctdb = state->ctdb;
+
+	reqid_remove(ctdb->idr, state->reqid);
+	return 0;
+}
+
+
+/*
+  Timer handler fired when a control has not been answered in time.
+  Logs the request, marks the state CTDB_CONTROL_TIMEOUT and, if an
+  async callback is registered, schedules its delivery through a
+  zero-delay timer.
+*/
+static void control_timeout_func(struct tevent_context *ev,
+				 struct tevent_timer *te,
+				 struct timeval t, void *private_data)
+{
+	struct ctdb_client_control_state *state;
+
+	state = talloc_get_type(private_data, struct ctdb_client_control_state);
+
+	DEBUG(DEBUG_ERR,(__location__ " control timed out. reqid:%u opcode:%u "
+			 "dstnode:%u\n", state->reqid, state->c->opcode,
+			 state->c->hdr.destnode));
+
+	state->state = CTDB_CONTROL_TIMEOUT;
+
+	if (state->async.fn == NULL) {
+		return;
+	}
+
+	tevent_add_timer(state->ctdb->ev, state, timeval_zero(),
+			 invoke_control_callback, state);
+}
+
+/*
+  async version of send control request
+
+  Builds a CTDB_REQ_CONTROL packet for 'opcode' addressed to
+  'destnode', optionally arms a timeout timer, and queues the packet
+  to the daemon.  The returned state is a talloc child of mem_ctx and
+  is meant to be passed to ctdb_control_recv().
+
+  Returns NULL on allocation or queueing failure, and also - by design -
+  when CTDB_CTRL_FLAG_NOREPLY is set, since no reply will arrive.
+  *errormsg, if given, is reset to NULL.
+*/
+struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
+		uint32_t destnode, uint64_t srvid,
+		uint32_t opcode, uint32_t flags, TDB_DATA data,
+		TALLOC_CTX *mem_ctx,
+		struct timeval *timeout,
+		char **errormsg)
+{
+	struct ctdb_client_control_state *state;
+	size_t len;
+	struct ctdb_req_control_old *c;
+	int ret;
+
+	if (errormsg) {
+		*errormsg = NULL;
+	}
+
+	/* if the domain socket is not yet open, open it */
+	if (ctdb->daemon.sd==-1) {
+		ctdb_socket_connect(ctdb);
+	}
+
+	state = talloc_zero(mem_ctx, struct ctdb_client_control_state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
+
+	state->ctdb       = ctdb;
+	state->reqid      = reqid_new(ctdb->idr, state);
+	state->state      = CTDB_CONTROL_WAIT;
+	state->errormsg   = NULL;
+
+	talloc_set_destructor(state, ctdb_client_control_destructor);
+
+	len = offsetof(struct ctdb_req_control_old, data) + data.dsize;
+	c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CONTROL,
+			       len, struct ctdb_req_control_old);
+	if (c == NULL) {
+		/* free the state (its destructor releases the reqid)
+		   rather than leaving both behind on mem_ctx */
+		talloc_free(state);
+	}
+	CTDB_NO_MEMORY_NULL(ctdb, c);
+	state->c            = c;
+	c->hdr.reqid        = state->reqid;
+	c->hdr.destnode     = destnode;
+	c->opcode           = opcode;
+	c->client_id        = 0;
+	c->flags            = flags;
+	c->srvid            = srvid;
+	c->datalen          = data.dsize;
+	if (data.dsize) {
+		memcpy(&c->data[0], data.dptr, data.dsize);
+	}
+
+	/* timeout */
+	if (timeout && !timeval_is_zero(timeout)) {
+		tevent_add_timer(ctdb->ev, state, *timeout,
+				 control_timeout_func, state);
+	}
+
+	ret = ctdb_client_queue_pkt(ctdb, &(c->hdr));
+	if (ret != 0) {
+		talloc_free(state);
+		return NULL;
+	}
+
+	/* nothing will come back, so there is nothing to wait on */
+	if (flags & CTDB_CTRL_FLAG_NOREPLY) {
+		talloc_free(state);
+		return NULL;
+	}
+
+	return state;
+}
+
+
+/*
+ async version of receive control reply
+
+ Runs the event loop until 'state' leaves CTDB_CONTROL_WAIT, then
+ reports the outcome.  *status defaults to -1 and *errormsg to NULL.
+
+ Returns -1 if state is NULL or the control did not reach
+ CTDB_CONTROL_DONE.  If the reply carried an error message, the message
+ is moved to mem_ctx (when errormsg is given) and the reply status is
+ returned, or -1 if that status is 0.  Otherwise returns 0 with a copy
+ of the reply data in *outdata (allocated on mem_ctx) and the reply
+ status in *status.
+
+ In every path past the NULL check the registered async callback, if
+ any, is invoked and 'state' is freed.
+*/
+int ctdb_control_recv(struct ctdb_context *ctdb,
+ struct ctdb_client_control_state *state,
+ TALLOC_CTX *mem_ctx,
+ TDB_DATA *outdata, int32_t *status, char **errormsg)
+{
+ TALLOC_CTX *tmp_ctx;
+
+ if (status != NULL) {
+ *status = -1;
+ }
+ if (errormsg != NULL) {
+ *errormsg = NULL;
+ }
+
+ if (state == NULL) {
+ return -1;
+ }
+
+ /* prevent double free of state */
+ tmp_ctx = talloc_new(ctdb);
+ talloc_steal(tmp_ctx, state);
+
+ /* loop one event at a time until we either timeout or the control
+ completes.
+ */
+ while (state->state == CTDB_CONTROL_WAIT) {
+ tevent_loop_once(ctdb->ev);
+ }
+
+ if (state->state != CTDB_CONTROL_DONE) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_control_recv failed\n"));
+ if (state->async.fn) {
+ state->async.fn(state);
+ }
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ if (state->errormsg) {
+ /* an error message always means failure, even with status 0 */
+ int s = (state->status == 0 ? -1 : state->status);
+ DEBUG(DEBUG_ERR,("ctdb_control error: '%s'\n", state->errormsg));
+ if (errormsg) {
+ (*errormsg) = talloc_move(mem_ctx, &state->errormsg);
+ }
+ if (state->async.fn) {
+ state->async.fn(state);
+ }
+ talloc_free(tmp_ctx);
+ return s;
+ }
+
+ if (outdata) {
+ *outdata = state->outdata;
+ /* copy out: state->outdata goes away with state below.
+    NOTE(review): the copy is not checked for allocation failure */
+ outdata->dptr = talloc_memdup(mem_ctx, outdata->dptr, outdata->dsize);
+ }
+
+ if (status) {
+ *status = state->status;
+ }
+
+ if (state->async.fn) {
+ state->async.fn(state);
+ }
+
+ talloc_free(tmp_ctx);
+ return 0;
+}
+
+
+
+/*
+  Synchronous control: send the request and wait for its reply.
+
+  'timeout' bounds how long to wait for the reply; with a NULL timeout
+  the wait is unbounded.  When CTDB_CTRL_FLAG_NOREPLY is set no reply
+  is awaited: *status (if given) is set to 0 and 0 is returned.
+ */
+int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
+		 uint32_t opcode, uint32_t flags, TDB_DATA data,
+		 TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
+		 struct timeval *timeout,
+		 char **errormsg)
+{
+	struct ctdb_client_control_state *state;
+	bool noreply = (flags & CTDB_CTRL_FLAG_NOREPLY) != 0;
+
+	state = ctdb_control_send(ctdb, destnode, srvid, opcode,
+				  flags, data, mem_ctx,
+				  timeout, errormsg);
+
+	if (!noreply) {
+		return ctdb_control_recv(ctdb, state, mem_ctx, outdata,
+					 status, errormsg);
+	}
+
+	/* FIXME: Error conditions in ctdb_control_send return NULL without
+	 * setting errormsg.  So, there is no way to distinguish between success
+	 * and failure when CTDB_CTRL_FLAG_NOREPLY is set */
+	if (status != NULL) {
+		*status = 0;
+	}
+	return 0;
+}
+
+/*
+  get vnn map from a remote node
+
+  Sends CTDB_CONTROL_GETVNNMAP to 'destnode', checks that the reply
+  size matches the entry count it announces, and converts the wire
+  format into a newly allocated struct ctdb_vnn_map on mem_ctx.
+
+  Returns 0 with *vnnmap set on success, -1 on control failure or a
+  malformed reply.  The reply buffer is released on every path.
+ */
+int ctdb_ctrl_getvnnmap(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_vnn_map **vnnmap)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+	struct ctdb_vnn_map_wire *map;
+	uint32_t *entries;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GETVNNMAP, 0, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getvnnmap failed\n"));
+		return -1;
+	}
+
+	map = (struct ctdb_vnn_map_wire *)outdata.dptr;
+	/* the first test guarantees map->size is readable before the
+	   second one uses it */
+	if (outdata.dsize < offsetof(struct ctdb_vnn_map_wire, map) ||
+	    outdata.dsize != map->size*sizeof(uint32_t) + offsetof(struct ctdb_vnn_map_wire, map)) {
+		DEBUG(DEBUG_ERR,("Bad vnn map size received in ctdb_ctrl_getvnnmap\n"));
+		talloc_free(outdata.dptr);
+		return -1;
+	}
+
+	(*vnnmap) = talloc(mem_ctx, struct ctdb_vnn_map);
+	if (*vnnmap == NULL) {
+		talloc_free(outdata.dptr);
+	}
+	CTDB_NO_MEMORY(ctdb, *vnnmap);
+	(*vnnmap)->generation = map->generation;
+	(*vnnmap)->size       = map->size;
+
+	entries = talloc_array(*vnnmap, uint32_t, map->size);
+	if (entries == NULL) {
+		/* don't hand back a half-built map or leak the reply */
+		talloc_free(outdata.dptr);
+		TALLOC_FREE(*vnnmap);
+	}
+	CTDB_NO_MEMORY(ctdb, entries);
+	(*vnnmap)->map = entries;
+
+	memcpy((*vnnmap)->map, map->map, sizeof(uint32_t)*map->size);
+	talloc_free(outdata.dptr);
+
+	return 0;
+}
+
+
+/*
+  Start an asynchronous CTDB_CONTROL_GET_RECMODE request to 'destnode'.
+  The returned state (NULL on failure) is consumed by
+  ctdb_ctrl_getrecmode_recv().
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+	struct ctdb_client_control_state *pending;
+
+	pending = ctdb_control_send(ctdb, destnode, 0,
+				    CTDB_CONTROL_GET_RECMODE, 0, tdb_null,
+				    mem_ctx, &timeout, NULL);
+	return pending;
+}
+
+/*
+  Finish a GET_RECMODE request started with ctdb_ctrl_getrecmode_send().
+  The recovery mode is carried in the control's status value and is
+  stored in *recmode when that pointer is given.  Returns 0 on success,
+  -1 if receiving the reply fails.
+ */
+int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode)
+{
+	int32_t mode;
+
+	if (ctdb_control_recv(ctdb, state, mem_ctx, NULL, &mode, NULL) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getrecmode_recv failed\n"));
+		return -1;
+	}
+
+	if (recmode != NULL) {
+		*recmode = (uint32_t)mode;
+	}
+
+	return 0;
+}
+
+/*
+  Synchronous wrapper: fetch the recovery mode of 'destnode' by pairing
+  ctdb_ctrl_getrecmode_send() with ctdb_ctrl_getrecmode_recv().
+ */
+int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode)
+{
+	struct ctdb_client_control_state *pending =
+		ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, timeout, destnode);
+
+	return ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx, pending, recmode);
+}
+
+
+
+
+/*
+  Set the recovery mode of 'destnode' with CTDB_CONTROL_SET_RECMODE.
+  The new mode is sent as the raw bytes of a uint32_t.  Returns 0 on
+  success, -1 if the control fails or reports a non-zero status.
+ */
+int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmode)
+{
+	TDB_DATA payload = {
+		.dptr = (unsigned char *)&recmode,
+		.dsize = sizeof(uint32_t),
+	};
+	int32_t status;
+	int rc;
+
+	rc = ctdb_control(ctdb, destnode, 0,
+			  CTDB_CONTROL_SET_RECMODE, 0, payload,
+			  NULL, NULL, &status, &timeout, NULL);
+	if (rc == 0 && status == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setrecmode failed\n"));
+	return -1;
+}
+
+
+
+/*
+  get a list of nodes (vnn and flags ) from a remote node
+
+  Sends CTDB_CONTROL_GET_NODEMAP to 'destnode' and returns a copy of
+  the reply, allocated on mem_ctx, in *nodemap.  Returns 0 on success;
+  -1 if the control fails, the reply is empty, or the copy cannot be
+  allocated.
+ */
+int ctdb_ctrl_getnodemap(struct ctdb_context *ctdb,
+		struct timeval timeout, uint32_t destnode,
+		TALLOC_CTX *mem_ctx, struct ctdb_node_map_old **nodemap)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GET_NODEMAP, 0, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0 || outdata.dsize == 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getnodes failed ret:%d res:%d\n", ret, res));
+		return -1;
+	}
+
+	*nodemap = (struct ctdb_node_map_old *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+	talloc_free(outdata.dptr);
+	if (*nodemap == NULL) {
+		/* don't report success with nothing to hand back */
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getnodes "
+				 "talloc_memdup size %u failed\n",
+				 (unsigned int)outdata.dsize));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  Query the run state of 'destnode' with CTDB_CONTROL_GET_RUNSTATE.
+
+  On success returns 0 and, if 'runstate' is given, stores the value
+  from the reply.  If the control fails, the non-zero return code of
+  ctdb_control() is returned, or else the non-zero status of the
+  control.  A reply of the wrong size yields -1.
+ */
+int ctdb_ctrl_get_runstate(struct ctdb_context *ctdb,
+			   struct timeval timeout,
+			   uint32_t destnode,
+			   uint32_t *runstate)
+{
+	TDB_DATA reply;
+	int32_t status;
+	int rc;
+
+	rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_RUNSTATE, 0,
+			  tdb_null, ctdb, &reply, &status, &timeout, NULL);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR,("ctdb_control for get_runstate failed\n"));
+		return rc;
+	}
+	if (status != 0) {
+		DEBUG(DEBUG_ERR,("ctdb_control for get_runstate failed\n"));
+		return status;
+	}
+
+	if (reply.dsize == sizeof(uint32_t)) {
+		if (runstate != NULL) {
+			*runstate = *(uint32_t *)reply.dptr;
+		}
+		talloc_free(reply.dptr);
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,("Invalid return data in get_runstate\n"));
+	talloc_free(reply.dptr);
+	return -1;
+}
+
+/*
+  get debug level on a node
+
+  Sends CTDB_CONTROL_GET_DEBUG to 'destnode' without a timeout and
+  stores the returned int32_t in *level.  Returns 0 on success, -1 if
+  the control fails or the reply has the wrong size.
+ */
+int ctdb_ctrl_get_debuglevel(struct ctdb_context *ctdb, uint32_t destnode, int32_t *level)
+{
+	int ret;
+	int32_t res;
+	TDB_DATA data;
+
+	ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_DEBUG, 0, tdb_null,
+			   ctdb, &data, &res, NULL, NULL);
+	if (ret != 0 || res != 0) {
+		return -1;
+	}
+	if (data.dsize != sizeof(int32_t)) {
+		DEBUG(DEBUG_ERR,("Bad control reply size in ctdb_get_debuglevel (got %u)\n",
+			 (unsigned)data.dsize));
+		/* the reply hangs off the long-lived ctdb context, so it
+		   has to be released here too */
+		talloc_free(data.dptr);
+		return -1;
+	}
+	*level = *(int32_t *)data.dptr;
+	talloc_free(data.dptr);
+	return 0;
+}
+
+/*
+  Freeze all databases on 'destnode' by sending CTDB_CONTROL_FREEZE.
+  Returns 0 on success, -1 if the control fails or reports a non-zero
+  status.
+ */
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout,
+		     uint32_t destnode)
+{
+	int32_t status;
+	int rc;
+
+	rc = ctdb_control(ctdb, destnode, 0,
+			  CTDB_CONTROL_FREEZE, 0, tdb_null,
+			  NULL, NULL, &status, &timeout, NULL);
+	if (rc == 0 && status == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR, ("ctdb_ctrl_freeze_priority failed\n"));
+	return -1;
+}
+
+/*
+  Ask 'destnode' for its pnn with CTDB_CONTROL_GET_PNN.  The pnn is
+  carried in the control's status value and returned as is; -1 is
+  returned if the control itself fails.
+ */
+int ctdb_ctrl_getpnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+	int32_t pnn;
+	int rc;
+
+	rc = ctdb_control(ctdb, destnode, 0,
+			  CTDB_CONTROL_GET_PNN, 0, tdb_null,
+			  NULL, NULL, &pnn, &timeout, NULL);
+	if (rc == 0) {
+		return pnn;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpnn failed\n"));
+	return -1;
+}
+
+/*
+  Fetch the public IP list of 'destnode' with
+  CTDB_CONTROL_GET_PUBLIC_IPS, passing 'flags' through as the control
+  flags.  On success *ips is a copy of the reply allocated on mem_ctx
+  and 0 is returned; -1 is returned if the control fails or the copy
+  cannot be allocated.
+ */
+int ctdb_ctrl_get_public_ips_flags(struct ctdb_context *ctdb,
+				   struct timeval timeout, uint32_t destnode,
+				   TALLOC_CTX *mem_ctx,
+				   uint32_t flags,
+				   struct ctdb_public_ip_list_old **ips)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GET_PUBLIC_IPS, flags, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__
+				 " ctdb_control for getpublicips failed ret:%d res:%d\n",
+				 ret, res));
+		return -1;
+	}
+
+	*ips = (struct ctdb_public_ip_list_old *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+	talloc_free(outdata.dptr);
+	if (*ips == NULL) {
+		/* don't report success with nothing to hand back */
+		DEBUG(DEBUG_ERR,(__location__
+				 " ctdb_control for getpublicips "
+				 "talloc_memdup size %u failed\n",
+				 (unsigned int)outdata.dsize));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  Convenience form of ctdb_ctrl_get_public_ips_flags() with no control
+  flags set.
+ */
+int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
+			     struct timeval timeout, uint32_t destnode,
+			     TALLOC_CTX *mem_ctx,
+			     struct ctdb_public_ip_list_old **ips)
+{
+	const uint32_t no_flags = 0;
+
+	return ctdb_ctrl_get_public_ips_flags(ctdb, timeout, destnode,
+					      mem_ctx, no_flags, ips);
+}
+
+/*
+  Fetch the interface list of 'destnode' with CTDB_CONTROL_GET_IFACES.
+
+  The reply is validated before use: it must hold the list header and
+  as many entries as the header announces.  Interface names are then
+  forcibly NUL-terminated and a copy of the reply, allocated on
+  mem_ctx, is returned in *_ifaces.
+
+  Returns 0 on success, -1 on control failure, malformed reply or
+  allocation failure.  The reply buffer is released on every path.
+ */
+int ctdb_ctrl_get_ifaces(struct ctdb_context *ctdb,
+			 struct timeval timeout, uint32_t destnode,
+			 TALLOC_CTX *mem_ctx,
+			 struct ctdb_iface_list_old **_ifaces)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+	struct ctdb_iface_list_old *ifaces;
+	size_t len;
+	uint64_t need;
+	uint32_t i;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_GET_IFACES, 0, tdb_null,
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "failed ret:%d res:%d\n",
+				 ret, res));
+		return -1;
+	}
+
+	len = offsetof(struct ctdb_iface_list_old, ifaces);
+	if (len > outdata.dsize) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "returned invalid data with size %u > %u\n",
+				 (unsigned int)outdata.dsize,
+				 (unsigned int)len));
+		dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+		talloc_free(outdata.dptr);
+		return -1;
+	}
+
+	ifaces = (struct ctdb_iface_list_old *)outdata.dptr;
+	/* computed in 64 bits: a huge entry count must not wrap around
+	   and slip past the bounds check, since the loop below writes
+	   into every announced entry */
+	need = (uint64_t)len +
+		(uint64_t)ifaces->num * sizeof(struct ctdb_iface);
+
+	if (need > outdata.dsize) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "returned invalid data with size %u > %llu\n",
+				 (unsigned int)outdata.dsize,
+				 (unsigned long long)need));
+		dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+		talloc_free(outdata.dptr);
+		return -1;
+	}
+
+	/* make sure we null terminate the returned strings */
+	for (i=0; i < ifaces->num; i++) {
+		ifaces->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+	}
+
+	*_ifaces = (struct ctdb_iface_list_old *)talloc_memdup(mem_ctx,
+								outdata.dptr,
+								outdata.dsize);
+	talloc_free(outdata.dptr);
+	if (*_ifaces == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+				 "talloc_memdup size %u failed\n",
+				 (unsigned int)outdata.dsize));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  get all tunables
+
+  Sends CTDB_CONTROL_GET_ALL_TUNABLES to 'destnode' and copies the
+  reply into *tunables.  The reply must be exactly
+  sizeof(struct ctdb_tunable_list) bytes.  Returns 0 on success, -1 on
+  control failure or size mismatch.
+ */
+int ctdb_ctrl_get_all_tunables(struct ctdb_context *ctdb,
+			       struct timeval timeout,
+			       uint32_t destnode,
+			       struct ctdb_tunable_list *tunables)
+{
+	TDB_DATA outdata;
+	int ret;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_ALL_TUNABLES, 0, tdb_null, ctdb,
+			   &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get all tunables failed\n"));
+		return -1;
+	}
+
+	if (outdata.dsize != sizeof(*tunables)) {
+		DEBUG(DEBUG_ERR,(__location__ " bad data size %u in ctdb_ctrl_get_all_tunables should be %u\n",
+			 (unsigned)outdata.dsize, (unsigned)sizeof(*tunables)));
+		/* the reply hangs off the long-lived ctdb context, so it
+		   has to be released here too */
+		talloc_free(outdata.dptr);
+		return -1;
+	}
+
+	*tunables = *(struct ctdb_tunable_list *)outdata.dptr;
+	talloc_free(outdata.dptr);
+	return 0;
+}
+
+/*
+  Turn on the given bits in the context's flag word; bits that are
+  already set stay set.
+*/
+void ctdb_set_flags(struct ctdb_context *ctdb, unsigned flags)
+{
+	ctdb->flags = ctdb->flags | flags;
+}
+
+/*
+  Return the daemon socket name stored in the context.  The string is
+  not copied.
+*/
+const char *ctdb_get_socketname(struct ctdb_context *ctdb)
+{
+	const char *name = ctdb->daemon.name;
+
+	return name;
+}
+
+/*
+  Accessor for the pnn stored in the context.
+*/
+uint32_t ctdb_get_pnn(struct ctdb_context *ctdb)
+{
+	uint32_t pnn = ctdb->pnn;
+
+	return pnn;
+}
+
+/*
+ callback for the async helpers used when sending the same control
+ to multiple nodes in parallel.
+
+ Installed on each control state by ctdb_client_async_add().  It
+ decrements the outstanding-reply counter of the shared
+ client_async_data and then reports the outcome of this one control:
+ on failure fail_count is incremented and fail_callback (if set) is
+ called; whenever the reply could be received, callback (if set) is
+ called with the reply status and data.
+*/
+static void async_callback(struct ctdb_client_control_state *state)
+{
+ struct client_async_data *data = talloc_get_type(state->async.private_data, struct client_async_data);
+ struct ctdb_context *ctdb = talloc_get_type(state->ctdb, struct ctdb_context);
+ int ret;
+ TDB_DATA outdata;
+ int32_t res = -1;
+ uint32_t destnode = state->c->hdr.destnode;
+
+ outdata.dsize = 0;
+ outdata.dptr = NULL;
+
+ /* one more node has responded with recmode data */
+ data->count--;
+
+ /* if we failed to push the db, then return an error and let
+ the main loop try again.
+ */
+ if (state->state != CTDB_CONTROL_DONE) {
+ if ( !data->dont_log_errors) {
+ DEBUG(DEBUG_ERR,("Async operation failed with state %d, opcode:%u\n", state->state, data->opcode));
+ }
+ data->fail_count++;
+ /* report a timeout distinctly from any other failure */
+ if (state->state == CTDB_CONTROL_TIMEOUT) {
+ res = -ETIMEDOUT;
+ } else {
+ res = -1;
+ }
+ if (data->fail_callback) {
+ data->fail_callback(ctdb, destnode, res, outdata,
+ data->callback_data);
+ }
+ return;
+ }
+
+ /* ctdb_control_recv() below invokes state->async.fn when set;
+    clearing it keeps this function from being re-entered */
+ state->async.fn = NULL;
+
+ /* the reply data is copied onto 'data' */
+ ret = ctdb_control_recv(ctdb, state, data, &outdata, &res, NULL);
+ if ((ret != 0) || (res != 0)) {
+ if ( !data->dont_log_errors) {
+ DEBUG(DEBUG_ERR,("Async operation failed with ret=%d res=%d opcode=%u\n", ret, (int)res, data->opcode));
+ }
+ data->fail_count++;
+ if (data->fail_callback) {
+ data->fail_callback(ctdb, destnode, res, outdata,
+ data->callback_data);
+ }
+ }
+ /* note: with ret == 0 but res != 0 both fail_callback and
+    callback are invoked */
+ if ((ret == 0) && (data->callback != NULL)) {
+ data->callback(ctdb, destnode, res, outdata,
+ data->callback_data);
+ }
+}
+
+
+/*
+  register an in-flight control with an async data set: the set gains
+  one more control to wait for and the control's completion is routed
+  to async_callback()
+*/
+void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state)
+{
+	/* account for the additional outstanding control */
+	data->count++;
+
+	/* hook up completion handling */
+	state->async.private_data = data;
+	state->async.fn = async_callback;
+}
+
+
+/* run the event loop until every control registered in data has
+   completed (or timed out), then report whether any of them failed
+
+   returns 0 if no control failed, -1 otherwise
+*/
+int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data)
+{
+	while (data->count > 0) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	if (data->fail_count == 0) {
+		return 0;
+	}
+
+	if (!data->dont_log_errors) {
+		DEBUG(DEBUG_ERR,("Async wait failed - fail_count=%u\n",
+			 data->fail_count));
+	}
+	return -1;
+}
+
+
+/*
+  send the same control to every node in the talloc'ed array "nodes"
+  and wait for all of them to finish
+
+  Nothing is returned to the caller except the status: 0 if every
+  control succeeded, -1 if sending failed or any control failed.
+  Reply data is only handed to client_callback / fail_callback.
+ */
+int ctdb_client_async_control(struct ctdb_context *ctdb,
+				enum ctdb_controls opcode,
+				uint32_t *nodes,
+				uint64_t srvid,
+				struct timeval timeout,
+				bool dont_log_errors,
+				TDB_DATA data,
+				client_async_callback client_callback,
+				client_async_callback fail_callback,
+				void *callback_data)
+{
+	struct client_async_data *async_data;
+	int idx, num_nodes;
+	int wait_status;
+
+	async_data = talloc_zero(ctdb, struct client_async_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+	async_data->opcode          = opcode;
+	async_data->dont_log_errors = dont_log_errors;
+	async_data->callback        = client_callback;
+	async_data->fail_callback   = fail_callback;
+	async_data->callback_data   = callback_data;
+
+	/* the node count is derived from the size of the talloc array */
+	num_nodes = talloc_get_size(nodes) / sizeof(uint32_t);
+
+	/* fire off one async control per listed node */
+	for (idx = 0; idx < num_nodes; idx++) {
+		struct ctdb_client_control_state *state;
+
+		state = ctdb_control_send(ctdb, nodes[idx], srvid, opcode,
+					  0, data, async_data, &timeout, NULL);
+		if (state == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
+			talloc_free(async_data);
+			return -1;
+		}
+
+		ctdb_client_async_add(async_data, state);
+	}
+
+	wait_status = ctdb_client_async_wait(ctdb, async_data);
+	talloc_free(async_data);
+
+	return (wait_status != 0) ? -1 : 0;
+}
+
+/*
+  build a talloc'ed array (on mem_ctx) of the PNNs listed in vnn_map,
+  leaving out this node's own PNN unless include_self is set
+*/
+uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb,
+				struct ctdb_vnn_map *vnn_map,
+				TALLOC_CTX *mem_ctx,
+				bool include_self)
+{
+	unsigned int idx;
+	unsigned int wanted = 0;
+	unsigned int filled = 0;
+	uint32_t *nodes;
+
+	/* first pass: count the entries that will be kept */
+	for (idx = 0; idx < vnn_map->size; idx++) {
+		if (include_self || vnn_map->map[idx] != ctdb->pnn) {
+			wanted++;
+		}
+	}
+
+	nodes = talloc_array(mem_ctx, uint32_t, wanted);
+	CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+	/* second pass: copy them */
+	for (idx = 0; idx < vnn_map->size; idx++) {
+		if (include_self || vnn_map->map[idx] != ctdb->pnn) {
+			nodes[filled] = vnn_map->map[idx];
+			filled++;
+		}
+	}
+
+	return nodes;
+}
+
+/* Build a talloc'ed array (on mem_ctx) of the PNNs in node_map whose
+ * flags have none of the bits in mask set.  This node's own PNN is
+ * left out unless include_self is set. */
+static uint32_t *list_of_nodes(struct ctdb_context *ctdb,
+			       struct ctdb_node_map_old *node_map,
+			       TALLOC_CTX *mem_ctx,
+			       uint32_t mask,
+			       bool include_self)
+{
+	unsigned int idx;
+	unsigned int wanted = 0;
+	unsigned int filled = 0;
+	uint32_t exclude_pnn;
+	uint32_t *nodes;
+
+	if (include_self) {
+		exclude_pnn = CTDB_UNKNOWN_PNN;
+	} else {
+		exclude_pnn = ctdb->pnn;
+	}
+
+	/* first pass: count the nodes that pass both filters */
+	for (idx = 0; idx < node_map->num; idx++) {
+		if ((node_map->nodes[idx].flags & mask) == 0 &&
+		    node_map->nodes[idx].pnn != exclude_pnn) {
+			wanted++;
+		}
+	}
+
+	nodes = talloc_array(mem_ctx, uint32_t, wanted);
+	CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+	/* second pass: record their PNNs */
+	for (idx = 0; idx < node_map->num; idx++) {
+		if ((node_map->nodes[idx].flags & mask) == 0 &&
+		    node_map->nodes[idx].pnn != exclude_pnn) {
+			nodes[filled] = node_map->nodes[idx].pnn;
+			filled++;
+		}
+	}
+
+	return nodes;
+}
+
+/* List the nodes in node_map that have no NODE_FLAGS_INACTIVE bit set */
+uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
+				struct ctdb_node_map_old *node_map,
+				TALLOC_CTX *mem_ctx,
+				bool include_self)
+{
+	const uint32_t skip_mask = NODE_FLAGS_INACTIVE;
+
+	return list_of_nodes(ctdb, node_map, mem_ctx, skip_mask, include_self);
+}
+
+/* List the nodes in node_map that do not have NODE_FLAGS_DISCONNECTED set */
+uint32_t *list_of_connected_nodes(struct ctdb_context *ctdb,
+				   struct ctdb_node_map_old *node_map,
+				   TALLOC_CTX *mem_ctx,
+				   bool include_self)
+{
+	const uint32_t skip_mask = NODE_FLAGS_DISCONNECTED;
+
+	return list_of_nodes(ctdb, node_map, mem_ctx, skip_mask, include_self);
+}
+
+/*
+  start an async CTDB_CONTROL_GET_CAPABILITIES request to destnode;
+  the returned state is handed to ctdb_ctrl_getcapabilities_recv()
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_getcapabilities_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+	struct ctdb_client_control_state *state;
+
+	state = ctdb_control_send(ctdb, destnode, 0,
+				  CTDB_CONTROL_GET_CAPABILITIES, 0, tdb_null,
+				  mem_ctx, &timeout, NULL);
+	return state;
+}
+
+/*
+  collect the reply to ctdb_ctrl_getcapabilities_send()
+
+  On success 0 is returned and, if capabilities is not NULL, the
+  capability word from the reply is stored there.  Returns -1 if the
+  control failed or if the reply is not exactly one uint32_t.
+ */
+int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *capabilities)
+{
+	int ret;
+	int32_t res;
+	TDB_DATA outdata;
+
+	ret = ctdb_control_recv(ctdb, state, mem_ctx, &outdata, &res, NULL);
+	if ( (ret != 0) || (res != 0) ) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv failed\n"));
+		return -1;
+	}
+
+	if (capabilities) {
+		/* never read the capability word out of a missing or
+		 * wrongly sized reply; same check as
+		 * get_capabilities_callback() */
+		if ((outdata.dptr == NULL) ||
+		    (outdata.dsize != sizeof(uint32_t))) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv returned invalid data with size %u\n",
+				 (unsigned)outdata.dsize));
+			return -1;
+		}
+		*capabilities = *((uint32_t *)outdata.dptr);
+	}
+
+	return 0;
+}
+
+/*
+  synchronous wrapper: fetch the capability word of destnode
+
+  Returns 0 on success and -1 on failure (allocation failure, the
+  control could not be sent, or the control failed).
+ */
+int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *capabilities)
+{
+	struct ctdb_client_control_state *state;
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	int ret;
+
+	if (tmp_ctx == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " talloc_new failed\n"));
+		return -1;
+	}
+
+	state = ctdb_ctrl_getcapabilities_send(ctdb, tmp_ctx, timeout, destnode);
+	if (state == NULL) {
+		/* nothing was sent, so there is nothing to receive */
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_send failed\n"));
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	ret = ctdb_ctrl_getcapabilities_recv(ctdb, tmp_ctx, state, capabilities);
+	talloc_free(tmp_ctx);
+	return ret;
+}
+
+/*
+  per-node success callback for ctdb_get_capabilities(): stores the
+  capability word from the reply in the slot indexed by the node's PNN
+  and marks that slot as retrieved.  Malformed replies and PNNs beyond
+  the end of the array are logged and ignored.
+*/
+static void get_capabilities_callback(struct ctdb_context *ctdb,
+				      uint32_t node_pnn, int32_t res,
+				      TDB_DATA outdata, void *callback_data)
+{
+	struct ctdb_node_capabilities *table;
+	bool reply_ok;
+
+	table = talloc_get_type(callback_data, struct ctdb_node_capabilities);
+
+	reply_ok = (outdata.dptr != NULL) &&
+		   (outdata.dsize == sizeof(uint32_t));
+	if (!reply_ok) {
+		DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
+		return;
+	}
+
+	if (node_pnn >= talloc_array_length(table)) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " unexpected PNN %u\n", node_pnn));
+		return;
+	}
+
+	table[node_pnn].capabilities = *((uint32_t *)outdata.dptr);
+	table[node_pnn].retrieved = true;
+}
+
+/*
+  fetch the capabilities of all active nodes in nodemap
+
+  Returns a talloc'ed array (on mem_ctx) with nodemap->num entries,
+  indexed by PNN.  Entries whose node did not answer keep
+  retrieved == false.  Returns NULL if the result array cannot be
+  allocated or if any of the controls failed.
+*/
+struct ctdb_node_capabilities *
+ctdb_get_capabilities(struct ctdb_context *ctdb,
+		      TALLOC_CTX *mem_ctx,
+		      struct timeval timeout,
+		      struct ctdb_node_map_old *nodemap)
+{
+	uint32_t *nodes;
+	uint32_t i;
+	int res;
+	struct ctdb_node_capabilities *ret;
+
+	/* Allocate the result first so that an early return cannot
+	 * leave the temporary node list behind on mem_ctx */
+	ret = talloc_array(mem_ctx, struct ctdb_node_capabilities,
+			   nodemap->num);
+	CTDB_NO_MEMORY_NULL(ctdb, ret);
+	/* Prepopulate the expected PNNs */
+	for (i = 0; i < talloc_array_length(ret); i++) {
+		ret[i].retrieved = false;
+	}
+
+	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+
+	res = ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
+					nodes, 0, timeout,
+					false, tdb_null,
+					get_capabilities_callback, NULL,
+					ret);
+	/* The node list is only needed while the controls are in
+	 * flight; the call above waits for all of them */
+	talloc_free(nodes);
+	if (res != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to read node capabilities.\n"));
+		TALLOC_FREE(ret);
+	}
+
+	return ret;
+}
+
+/*
+  return a pointer to the stored capability word for pnn, or NULL if
+  pnn is outside the array or its capabilities were never retrieved
+*/
+uint32_t *
+ctdb_get_node_capabilities(struct ctdb_node_capabilities *caps,
+			   uint32_t pnn)
+{
+	if (pnn >= talloc_array_length(caps)) {
+		return NULL;
+	}
+	if (!caps[pnn].retrieved) {
+		return NULL;
+	}
+
+	return &caps[pnn].capabilities;
+}
+
+/*
+  true only if capabilities for pnn were retrieved and every bit in
+  capabilities_required is set in them
+*/
+bool ctdb_node_has_capabilities(struct ctdb_node_capabilities *caps,
+				uint32_t pnn,
+				uint32_t capabilities_required)
+{
+	uint32_t *node_caps = ctdb_get_node_capabilities(caps, pnn);
+
+	if (node_caps == NULL) {
+		return false;
+	}
+
+	return (*node_caps & capabilities_required) == capabilities_required;
+}
+
+/*
+  recovery daemon ping to main daemon
+
+  sends CTDB_CONTROL_RECD_PING to the current node without a timeout;
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb)
+{
+	int32_t res;
+	int ret;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_PING, 0, tdb_null,
+			   ctdb, NULL, &res, NULL, NULL);
+	if (ret == 0 && res == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,("Failed to send recd ping\n"));
+	return -1;
+}
+
+/*
+  tell the main daemon how long it took to lock the reclock file
+
+  The latency value is sent as the raw bytes of the double.  Note that
+  the timeout argument is not forwarded: the control is issued with a
+  NULL timeout.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency)
+{
+	TDB_DATA data = {
+		.dptr = (uint8_t *)&latency,
+		.dsize = sizeof(latency),
+	};
+	int32_t res;
+	int ret;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data,
+			   ctdb, NULL, &res, NULL, NULL);
+	if (ret == 0 && res == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n"));
+	return -1;
+}
+
+/*
+  send the given ban state to destnode with CTDB_CONTROL_SET_BAN_STATE
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout,
+		      uint32_t destnode, struct ctdb_ban_state *bantime)
+{
+	TDB_DATA data = {
+		.dptr = (uint8_t *)bantime,
+		.dsize = sizeof(*bantime),
+	};
+	int32_t res;
+	int ret;
+
+	ret = ctdb_control(ctdb, destnode, 0,
+			   CTDB_CONTROL_SET_BAN_STATE, 0, data,
+			   NULL, NULL, &res, &timeout, NULL);
+	if (ret == 0 && res == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set ban state failed\n"));
+	return -1;
+}
+
+/*
+  start an async CTDB_CONTROL_UPDATE_RECORD for a single record
+
+  The record is marshalled into a temporary ctdb_marshall_buffer which
+  is released again before returning.  Returns the control state to be
+  passed to ctdb_ctrl_updaterecord_recv(), or NULL on failure.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	struct ctdb_client_control_state *handle;
+	struct ctdb_marshall_buffer *m;
+	struct ctdb_marshall_buffer *grown;
+	struct ctdb_rec_data_old *rec;
+	TDB_DATA outdata;
+
+	m = talloc_zero(mem_ctx, struct ctdb_marshall_buffer);
+	if (m == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate marshall buffer for update record\n"));
+		return NULL;
+	}
+
+	m->db_id = ctdb_db->db_id;
+
+	rec = ctdb_marshall_record(m, 0, key, header, data);
+	if (rec == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to marshall record for update record\n"));
+		talloc_free(m);
+		return NULL;
+	}
+
+	/* Grow through a temporary: on failure the original buffer is
+	 * still allocated and must be the one that gets freed */
+	grown = talloc_realloc_size(mem_ctx, m, rec->length + offsetof(struct ctdb_marshall_buffer, data));
+	if (grown == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata\n"));
+		talloc_free(m);
+		return NULL;
+	}
+	m = grown;
+
+	m->count++;
+	memcpy((uint8_t *)m + offsetof(struct ctdb_marshall_buffer, data), rec, rec->length);
+
+
+	outdata.dptr = (uint8_t *)m;
+	outdata.dsize = talloc_get_size(m);
+
+	handle = ctdb_control_send(ctdb, destnode, 0,
+				   CTDB_CONTROL_UPDATE_RECORD, 0, outdata,
+				   mem_ctx, &timeout, NULL);
+	talloc_free(m);
+	return handle;
+}
+
+/*
+  collect the result of ctdb_ctrl_updaterecord_send(); no reply data
+  is requested.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+	int32_t res;
+	int ret;
+
+	ret = ctdb_control_recv(ctdb, state, state, NULL, &res, NULL);
+	if ((ret == 0) && (res == 0)) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_update_record_recv failed\n"));
+	return -1;
+}
+
+/*
+  synchronous update-record: send the control, then wait for its result
+ */
+int
+ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	struct ctdb_client_control_state *state =
+		ctdb_ctrl_updaterecord_send(ctdb, mem_ctx, timeout, destnode,
+					    ctdb_db, key, header, data);
+	int result = ctdb_ctrl_updaterecord_recv(ctdb, state);
+
+	return result;
+}
diff --git a/ctdb/server/ctdb_cluster_mutex.c b/ctdb/server/ctdb_cluster_mutex.c
new file mode 100644
index 0000000..2fbe301
--- /dev/null
+++ b/ctdb/server/ctdb_cluster_mutex.c
@@ -0,0 +1,382 @@
+/*
+ CTDB cluster mutex handling
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Martin Schwenke 2016
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+#include "lib/util/strv.h"
+#include "lib/util/strv_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/blocking.h"
+
+#include "ctdb_private.h"
+
+#include "ctdb_cluster_mutex.h"
+
+struct ctdb_cluster_mutex_handle {
+ struct ctdb_context *ctdb; /* context passed to ctdb_kill() when the handle is destroyed */
+ cluster_mutex_handler_t handler; /* receives the helper's status byte, '2' on timeout, '3' if no status could be read */
+ void *private_data; /* opaque argument for handler */
+ cluster_mutex_lost_handler_t lost_handler; /* called on EOF from the helper after the initial response */
+ void *lost_data; /* opaque argument for lost_handler */
+ int fd[2]; /* pipe: fd[0] is read by the parent, fd[1] becomes the helper's stdout */
+ struct tevent_timer *te; /* timeout timer, NULL when no timeout was requested; freed on first pipe activity */
+ struct tevent_fd *fde; /* read event on fd[0] */
+ pid_t child; /* pid of the forked helper process */
+ struct timeval start_time; /* taken when the request starts; basis for the reported latency */
+ bool have_response; /* set once the initial response has been handled */
+};
+
+/* Timer callback: the helper did not answer in time, so report
+ * status '2' and the elapsed time to the handler (if one is set) */
+static void cluster_mutex_timeout(struct tevent_context *ev,
+				  struct tevent_timer *te,
+				  struct timeval t, void *private_data)
+{
+	struct ctdb_cluster_mutex_handle *handle;
+	double elapsed;
+
+	handle = talloc_get_type(private_data,
+				 struct ctdb_cluster_mutex_handle);
+	elapsed = timeval_elapsed(&handle->start_time);
+
+	if (handle->handler == NULL) {
+		return;
+	}
+
+	handle->handler('2', elapsed, handle->private_data);
+}
+
+
+/* talloc destructor for the handle: forgets the read end of the pipe
+ * (it is not closed here) and sends SIGTERM to the helper process, so
+ * that a helper holding the mutex releases it by exiting */
+static int cluster_mutex_destructor(struct ctdb_cluster_mutex_handle *h)
+{
+	/* whatever the previous value was, the slot ends up as -1 */
+	h->fd[0] = -1;
+
+	ctdb_kill(h->ctdb, h->child, SIGTERM);
+
+	return 0;
+}
+
+/* fd event handler for the read end of the pipe to the helper process.
+ The first event delivers the helper's one-byte status to the handler
+ ('3' if no byte could be read); later events are only used to detect
+ EOF, which is reported through the lost handler.
+*/
+static void cluster_mutex_handler(struct tevent_context *ev,
+ struct tevent_fd *fde,
+ uint16_t flags, void *private_data)
+{
+ struct ctdb_cluster_mutex_handle *h=
+ talloc_get_type(private_data, struct ctdb_cluster_mutex_handle);
+ double latency = timeval_elapsed(&h->start_time);
+ char c = '0';
+ int ret;
+
+ /* Got response from child process so abort timeout */
+ TALLOC_FREE(h->te);
+
+ ret = sys_read(h->fd[0], &c, 1);
+
+ /* Don't call the handler more than once. It only exists to
+ * process the initial response from the helper. */
+ if (h->have_response) {
+ /* Only deal with EOF due to process exit. Silently
+ * ignore any other output. */
+ if (ret == 0) {
+ if (h->lost_handler != NULL) {
+ h->lost_handler(h->lost_data);
+ }
+ }
+ return;
+ }
+ h->have_response = true;
+
+ /* If the child wrote status then just pass it to the handler.
+ * If no status was written then this is an unexpected error
+ * so pass generic error code to handler. */
+ if (h->handler != NULL) {
+ h->handler(ret == 1 ? c : '3', latency, h->private_data);
+ }
+}
+
+static char cluster_mutex_helper[PATH_MAX+1] = "";
+
+/*
+ * Build the argument vector for the default cluster mutex helper:
+ * { helper path, argstring, NULL }, allocated on mem_ctx.
+ *
+ * The helper path is resolved once and cached in cluster_mutex_helper:
+ * it comes from the CTDB_CLUSTER_MUTEX_HELPER environment variable if
+ * set, otherwise CTDB_HELPER_BINDIR/ctdb_mutex_fcntl_helper.  If the
+ * path is too long, cannot be stat()ed or is not executable by its
+ * owner, the process exits with status 1.
+ *
+ * Returns true and sets *argv on success; false if argstring is NULL
+ * or memory allocation fails.
+ */
+static bool cluster_mutex_helper_args_file(TALLOC_CTX *mem_ctx,
+					   const char *argstring,
+					   char ***argv)
+{
+	struct stat st;
+	size_t size = sizeof(cluster_mutex_helper);
+	const char *t;
+	char **args = NULL;
+	int ret;
+
+	if (cluster_mutex_helper[0] != '\0') {
+		goto helper_done;
+	}
+
+	t = getenv("CTDB_CLUSTER_MUTEX_HELPER");
+	if (t != NULL) {
+		size_t len;
+
+		len = strlcpy(cluster_mutex_helper, t, size);
+		if (len >= size) {
+			DBG_ERR("error: CTDB_CLUSTER_MUTEX_HELPER too long\n");
+			exit(1);
+		}
+	} else {
+		ret = snprintf(cluster_mutex_helper,
+			       size,
+			       "%s/%s",
+			       CTDB_HELPER_BINDIR,
+			       "ctdb_mutex_fcntl_helper");
+		if (ret < 0 || (size_t)ret >= size) {
+			D_ERR("Unable to set cluster mutex helper - "
+			      "path too long\n");
+			exit(1);
+		}
+	}
+
+	ret = stat(cluster_mutex_helper, &st);
+	if (ret != 0) {
+		D_ERR("Unable to set cluster mutex helper \"%s\" - %s\n",
+		      cluster_mutex_helper,
+		      strerror(errno));
+		exit(1);
+	}
+
+	if ((st.st_mode & S_IXUSR) == 0) {
+		D_ERR("Unable to set cluster_mutex helper \"%s\" - "
+		      "not executable\n",
+		      cluster_mutex_helper);
+		exit(1);
+	}
+
+	D_NOTICE("Set cluster mutex helper to \"%s\"\n", cluster_mutex_helper);
+
+helper_done:
+
+	/* A missing argument is a caller error, not an allocation
+	 * failure, so report it as such */
+	if (argstring == NULL) {
+		DBG_ERR("Missing cluster mutex argument\n");
+		return false;
+	}
+
+	/* Array includes default helper, file and NULL */
+	args = talloc_array(mem_ctx, char *, 3);
+	if (args == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return false;
+	}
+
+	args[0] = cluster_mutex_helper;
+
+	args[1] = talloc_strdup(args, argstring);
+	if (args[1] == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		/* Do not leave the half-built array on mem_ctx */
+		talloc_free(args);
+		return false;
+	}
+
+	args[2] = NULL;
+
+	*argv = args;
+	return true;
+}
+
+/*
+ * Build the argument vector for a user supplied helper command.
+ *
+ * argstring is split on spaces and tabs; the resulting words become
+ * argv[0..n-1], followed by a terminating NULL.  The array is
+ * allocated on mem_ctx and owns the split strings.
+ *
+ * Returns true and sets *argv on success; false if the string cannot
+ * be split, contains no words, or memory allocation fails.
+ */
+static bool cluster_mutex_helper_args_cmd(TALLOC_CTX *mem_ctx,
+					  const char *argstring,
+					  char ***argv)
+{
+	int i, ret, n;
+	char **args = NULL;
+	char *strv = NULL;
+	char *t;
+
+	ret = strv_split(mem_ctx, &strv, argstring, " \t");
+	if (ret != 0) {
+		D_ERR("Unable to parse mutex helper command \"%s\" (%s)\n",
+		      argstring,
+		      strerror(ret));
+		return false;
+	}
+	n = strv_count(strv);
+	if (n == 0) {
+		D_ERR("Mutex helper command is empty \"%s\"\n", argstring);
+		/* Nothing will take ownership of strv on this path */
+		TALLOC_FREE(strv);
+		return false;
+	}
+
+	/* Extra slot for NULL */
+	args = talloc_array(mem_ctx, char *, n + 1);
+	if (args == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		TALLOC_FREE(strv);
+		return false;
+	}
+
+	/* From here on the strings live and die with the array */
+	talloc_steal(args, strv);
+
+	t = NULL;
+	for (i = 0 ; i < n; i++) {
+		t = strv_next(strv, t);
+		args[i] = t;
+	}
+
+	args[n] = NULL;
+
+	*argv = args;
+	return true;
+}
+
+/*
+ * Choose how to build the helper argv: an argstring starting with '!'
+ * is a helper command line (the '!' is skipped); anything else,
+ * including NULL, is passed to the default helper as its argument.
+ */
+static bool cluster_mutex_helper_args(TALLOC_CTX *mem_ctx,
+				      const char *argstring,
+				      char ***argv)
+{
+	if (argstring == NULL || argstring[0] != '!') {
+		return cluster_mutex_helper_args_file(mem_ctx, argstring, argv);
+	}
+
+	return cluster_mutex_helper_args_cmd(mem_ctx, argstring + 1, argv);
+}
+
+/*
+ * Fork a helper process that attempts to take the cluster mutex.
+ *
+ * The helper's stdout is connected to a pipe whose read end is watched
+ * on ctdb->ev.  The first byte written by the helper is passed to
+ * handler as the status; if timeout is non-zero and nothing arrives
+ * in time, handler is called with status '2'.  EOF on the pipe after
+ * the initial response is reported through lost_handler.
+ *
+ * Freeing the returned handle sends SIGTERM to the helper.
+ *
+ * Returns the handle (allocated on mem_ctx), or NULL on failure.
+ */
+struct ctdb_cluster_mutex_handle *
+ctdb_cluster_mutex(TALLOC_CTX *mem_ctx,
+		   struct ctdb_context *ctdb,
+		   const char *argstring,
+		   int timeout,
+		   cluster_mutex_handler_t handler,
+		   void *private_data,
+		   cluster_mutex_lost_handler_t lost_handler,
+		   void *lost_data)
+{
+	struct ctdb_cluster_mutex_handle *h;
+	char **args;
+	sigset_t sigset_term;
+	int ret;
+
+	h = talloc(mem_ctx, struct ctdb_cluster_mutex_handle);
+	if (h == NULL) {
+		DBG_ERR("out of memory\n");
+		return NULL;
+	}
+
+	/* Initialise every field up front: the destructor reads
+	 * h->ctdb and h->child, so neither may be left undefined on
+	 * any path that frees the handle after the destructor has
+	 * been installed */
+	h->ctdb = ctdb;
+	h->handler = handler;
+	h->private_data = private_data;
+	h->lost_handler = lost_handler;
+	h->lost_data = lost_data;
+	h->te = NULL;
+	h->fde = NULL;
+	h->child = (pid_t)-1;
+	h->start_time = timeval_current();
+	h->fd[0] = -1;
+	h->fd[1] = -1;
+	h->have_response = false;
+
+	ret = pipe(h->fd);
+	if (ret != 0) {
+		talloc_free(h);
+		DBG_ERR("Failed to open pipe\n");
+		return NULL;
+	}
+	set_close_on_exec(h->fd[0]);
+
+	/* Create arguments for lock helper */
+	if (!cluster_mutex_helper_args(h, argstring, &args)) {
+		close(h->fd[0]);
+		close(h->fd[1]);
+		talloc_free(h);
+		return NULL;
+	}
+
+	/* Keep SIGTERM blocked across the fork; both parent and child
+	 * unblock it again below */
+	sigemptyset(&sigset_term);
+	sigaddset(&sigset_term, SIGTERM);
+	ret = sigprocmask(SIG_BLOCK, &sigset_term, NULL);
+	if (ret != 0) {
+		DBG_WARNING("Failed to block SIGTERM (%d)\n", errno);
+	}
+
+	h->child = ctdb_fork(ctdb);
+	if (h->child == (pid_t)-1) {
+		close(h->fd[0]);
+		close(h->fd[1]);
+		talloc_free(h);
+		ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL);
+		if (ret != 0) {
+			DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno);
+		}
+		return NULL;
+	}
+
+	if (h->child == 0) {
+		struct sigaction sa = {
+			.sa_handler = SIG_DFL,
+		};
+
+		ret = sigaction(SIGTERM, &sa, NULL);
+		if (ret != 0) {
+			DBG_WARNING("Failed to reset signal handler (%d)\n",
+				    errno);
+		}
+
+		ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL);
+		if (ret != 0) {
+			DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno);
+		}
+
+		/* Make stdout point to the pipe */
+		close(STDOUT_FILENO);
+		dup2(h->fd[1], STDOUT_FILENO);
+		close(h->fd[1]);
+
+		execv(args[0], args);
+
+		/* Only happens on error */
+		DBG_ERR("execv() failed\n");
+		_exit(1);
+	}
+
+	/* Parent */
+
+	ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL);
+	if (ret != 0) {
+		DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno);
+	}
+
+	/* Close-on-exec was already set on fd[0] right after pipe() */
+	DBG_DEBUG("Created PIPE FD:%d\n", h->fd[0]);
+
+	close(h->fd[1]);
+	h->fd[1] = -1;
+
+	/* From here on, freeing the handle terminates the helper */
+	talloc_set_destructor(h, cluster_mutex_destructor);
+
+	if (timeout != 0) {
+		h->te = tevent_add_timer(ctdb->ev, h,
+					 timeval_current_ofs(timeout, 0),
+					 cluster_mutex_timeout, h);
+		if (h->te == NULL) {
+			DBG_ERR("Failed to add timeout timer\n");
+			/* No fd event owns the descriptor yet */
+			close(h->fd[0]);
+			h->fd[0] = -1;
+			talloc_free(h);
+			return NULL;
+		}
+	}
+
+	h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
+			       cluster_mutex_handler, (void *)h);
+
+	if (h->fde == NULL) {
+		/* Auto-close has not been set up, so the read end has
+		 * to be closed by hand before the handle goes away */
+		close(h->fd[0]);
+		h->fd[0] = -1;
+		talloc_free(h);
+		return NULL;
+	}
+	tevent_fd_set_auto_close(h->fde);
+
+	return h;
+}
diff --git a/ctdb/server/ctdb_cluster_mutex.h b/ctdb/server/ctdb_cluster_mutex.h
new file mode 100644
index 0000000..4587290
--- /dev/null
+++ b/ctdb/server/ctdb_cluster_mutex.h
@@ -0,0 +1,51 @@
+/*
+ CTDB cluster mutex handling
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Martin Schwenke 2016
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_CLUSTER_MUTEX_H__
+#define __CTDB_CLUSTER_MUTEX_H__
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+/* Opaque handle for an in-progress or held cluster mutex request */
+struct ctdb_cluster_mutex_handle;
+
+/*
+ * Callback reporting the outcome of a mutex attempt.
+ *
+ * status is the byte written by the helper process, '2' if the request
+ * timed out, or '3' if no status byte could be read.  latency is the
+ * time elapsed since the request was started, as returned by
+ * timeval_elapsed() (presumably seconds - confirm).
+ */
+typedef void (*cluster_mutex_handler_t) (
+ char status,
+ double latency,
+ void *private_data);
+
+/* Callback invoked when the helper's pipe reaches EOF after the
+ * initial response has been delivered */
+typedef void (*cluster_mutex_lost_handler_t) (void *private_data);
+
+/*
+ * Start a helper process that attempts to take the cluster mutex.
+ *
+ * argstring starting with '!' is a helper command line, split on
+ * spaces and tabs; otherwise it is passed as the single argument to
+ * the default helper.  A non-zero timeout arms a timer via
+ * timeval_current_ofs(timeout, 0) (presumably seconds - confirm);
+ * 0 means no timeout.
+ *
+ * Returns a handle allocated on mem_ctx, or NULL on failure.  Freeing
+ * the handle sends SIGTERM to the helper process.
+ */
+struct ctdb_cluster_mutex_handle *
+ctdb_cluster_mutex(TALLOC_CTX *mem_ctx,
+ struct ctdb_context *ctdb,
+ const char *argstring,
+ int timeout,
+ cluster_mutex_handler_t handler,
+ void *private_data,
+ cluster_mutex_lost_handler_t lost_handler,
+ void *lost_data);
+
+#endif /* __CTDB_CLUSTER_MUTEX_H__ */
diff --git a/ctdb/server/ctdb_config.c b/ctdb/server/ctdb_config.c
new file mode 100644
index 0000000..7283027
--- /dev/null
+++ b/ctdb/server/ctdb_config.c
@@ -0,0 +1,183 @@
+/*
+ CTDB daemon config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "lib/util/debug.h"
+
+#include "common/conf.h"
+#include "common/logging_conf.h"
+#include "common/path.h"
+
+#include "cluster/cluster_conf.h"
+#include "database/database_conf.h"
+#include "event/event_conf.h"
+#include "failover/failover_conf.h"
+#include "legacy_conf.h"
+
+#include "ctdb_config.h"
+
+struct ctdb_config ctdb_config;
+
+/*
+ * Register each field of the global ctdb_config with the matching
+ * section/key of the configuration context, one conf_assign_*_pointer()
+ * call per option, grouped by configuration section.
+ */
+static void setup_config_pointers(struct conf_context *conf)
+{
+ /*
+ * Cluster
+ */
+
+ conf_assign_string_pointer(conf,
+ CLUSTER_CONF_SECTION,
+ CLUSTER_CONF_TRANSPORT,
+ &ctdb_config.transport);
+ conf_assign_string_pointer(conf,
+ CLUSTER_CONF_SECTION,
+ CLUSTER_CONF_NODE_ADDRESS,
+ &ctdb_config.node_address);
+ conf_assign_string_pointer(conf,
+ CLUSTER_CONF_SECTION,
+ CLUSTER_CONF_CLUSTER_LOCK,
+ &ctdb_config.cluster_lock);
+ conf_assign_string_pointer(conf,
+ CLUSTER_CONF_SECTION,
+ CLUSTER_CONF_RECOVERY_LOCK,
+ &ctdb_config.recovery_lock);
+ conf_assign_integer_pointer(conf,
+ CLUSTER_CONF_SECTION,
+ CLUSTER_CONF_LEADER_TIMEOUT,
+ &ctdb_config.leader_timeout);
+ conf_assign_boolean_pointer(conf,
+ CLUSTER_CONF_SECTION,
+ CLUSTER_CONF_LEADER_CAPABILITY,
+ &ctdb_config.leader_capability);
+
+ /*
+ * Database
+ */
+
+ conf_assign_string_pointer(conf,
+ DATABASE_CONF_SECTION,
+ DATABASE_CONF_VOLATILE_DB_DIR,
+ &ctdb_config.dbdir_volatile);
+ conf_assign_string_pointer(conf,
+ DATABASE_CONF_SECTION,
+ DATABASE_CONF_PERSISTENT_DB_DIR,
+ &ctdb_config.dbdir_persistent);
+ conf_assign_string_pointer(conf,
+ DATABASE_CONF_SECTION,
+ DATABASE_CONF_STATE_DB_DIR,
+ &ctdb_config.dbdir_state);
+ conf_assign_string_pointer(conf,
+ DATABASE_CONF_SECTION,
+ DATABASE_CONF_LOCK_DEBUG_SCRIPT,
+ &ctdb_config.lock_debug_script);
+ conf_assign_boolean_pointer(conf,
+ DATABASE_CONF_SECTION,
+ DATABASE_CONF_TDB_MUTEXES,
+ &ctdb_config.tdb_mutexes);
+
+ /*
+ * Event
+ */
+ conf_assign_string_pointer(conf,
+ EVENT_CONF_SECTION,
+ EVENT_CONF_DEBUG_SCRIPT,
+ &ctdb_config.event_debug_script);
+
+ /*
+ * Failover
+ */
+ conf_assign_boolean_pointer(conf,
+ FAILOVER_CONF_SECTION,
+ FAILOVER_CONF_DISABLED,
+ &ctdb_config.failover_disabled);
+
+ /*
+ * Legacy
+ */
+
+ conf_assign_boolean_pointer(conf,
+ LEGACY_CONF_SECTION,
+ LEGACY_CONF_REALTIME_SCHEDULING,
+ &ctdb_config.realtime_scheduling);
+ conf_assign_boolean_pointer(conf,
+ LEGACY_CONF_SECTION,
+ LEGACY_CONF_LMASTER_CAPABILITY,
+ &ctdb_config.lmaster_capability);
+ conf_assign_boolean_pointer(conf,
+ LEGACY_CONF_SECTION,
+ LEGACY_CONF_START_AS_STOPPED,
+ &ctdb_config.start_as_stopped);
+ conf_assign_boolean_pointer(conf,
+ LEGACY_CONF_SECTION,
+ LEGACY_CONF_START_AS_DISABLED,
+ &ctdb_config.start_as_disabled);
+ conf_assign_string_pointer(conf,
+ LEGACY_CONF_SECTION,
+ LEGACY_CONF_SCRIPT_LOG_LEVEL,
+ &ctdb_config.script_log_level);
+}
+
+/*
+ * Create the daemon's configuration context, register all sections
+ * and option pointers, and load the configuration file.
+ *
+ * A missing configuration file (ENOENT) is not an error.  On success
+ * 0 is returned and *result holds the context (allocated on mem_ctx);
+ * on failure an errno-style value is returned and the context is
+ * freed.
+ */
+int ctdbd_config_load(TALLOC_CTX *mem_ctx,
+		      struct conf_context **result)
+{
+	struct conf_context *conf = NULL;
+	char *conf_file = NULL;
+	int ret;
+
+	ret = conf_init(mem_ctx, &conf);
+	if (ret != 0) {
+		return ret;
+	}
+
+	/* register every configuration section ... */
+	logging_conf_init(conf, "NOTICE");
+	cluster_conf_init(conf);
+	database_conf_init(conf);
+	event_conf_init(conf);
+	failover_conf_init(conf);
+	legacy_conf_init(conf);
+
+	/* ... and bind the options to the fields of ctdb_config */
+	setup_config_pointers(conf);
+
+	if (! conf_valid(conf)) {
+		talloc_free(conf);
+		return EINVAL;
+	}
+
+	conf_file = path_config(conf);
+	if (conf_file == NULL) {
+		D_ERR("Memory allocation error\n");
+		talloc_free(conf);
+		return ENOMEM;
+	}
+
+	ret = conf_load(conf, conf_file, true);
+	/* Configuration file does not need to exist */
+	if (ret != 0 && ret != ENOENT) {
+		D_ERR("Failed to load configuration file %s\n", conf_file);
+		talloc_free(conf);
+		return ret;
+	}
+
+	talloc_free(conf_file);
+	*result = conf;
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_config.h b/ctdb/server/ctdb_config.h
new file mode 100644
index 0000000..7ccda7d
--- /dev/null
+++ b/ctdb/server/ctdb_config.h
@@ -0,0 +1,59 @@
+/*
+ CTDB daemon config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_CONFIG_H__
+#define __CTDB_CONFIG_H__
+
+#include "common/conf.h"
+
+/*
+ * Daemon configuration values, grouped by configuration section.
+ * Each field is bound to its section/key by setup_config_pointers()
+ * in ctdb_config.c.
+ */
+struct ctdb_config {
+ /* Cluster */
+ const char *transport;
+ const char *node_address;
+ const char *cluster_lock;
+ const char *recovery_lock;
+ int leader_timeout;
+ bool leader_capability;
+
+ /* Database */
+ const char *dbdir_volatile;
+ const char *dbdir_persistent;
+ const char *dbdir_state;
+ const char *lock_debug_script;
+ bool tdb_mutexes;
+
+ /* Event */
+ const char *event_debug_script;
+
+ /* Failover */
+ bool failover_disabled;
+
+ /* Legacy */
+ bool realtime_scheduling;
+ bool lmaster_capability;
+ bool start_as_stopped;
+ bool start_as_disabled;
+ const char *script_log_level;
+};
+
+/* The single global instance, defined in ctdb_config.c */
+extern struct ctdb_config ctdb_config;
+
+/*
+ * Initialise a configuration context on mem_ctx and load the
+ * configuration file.  Returns 0 and sets *conf on success, an
+ * errno-style value on failure.
+ */
+int ctdbd_config_load(TALLOC_CTX *mem_ctx, struct conf_context **conf);
+
+#endif /* __CTDB_CONFIG_H__ */
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
new file mode 100644
index 0000000..0826851
--- /dev/null
+++ b/ctdb/server/ctdb_control.c
@@ -0,0 +1,1089 @@
+/*
+ ctdb_control protocol code
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/talloc_report.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "protocol/protocol_private.h"
+
+#include "common/reqid.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+
+struct ctdb_control_state { /* bookkeeping for one control sent by ctdb_daemon_send_control() that still awaits its reply */
+ struct ctdb_context *ctdb; /* daemon context; used by the destructor and the timeout handler */
+ uint32_t reqid; /* id obtained from reqid_new(); compared with the reqid in the reply header */
+ ctdb_control_callback_fn_t callback; /* called when the reply arrives or the control times out */
+ void *private_data; /* handed back to callback unchanged */
+ unsigned flags; /* flags value the caller passed to ctdb_daemon_send_control() */
+};
+
+
+/*
+  Build a talloc memory report and return it to the client as a blob.
+
+  The report string is allocated as a talloc child of outdata. On success
+  outdata->dptr points at the string, outdata->dsize is its talloc size
+  without the terminating zero byte, and 0 is returned. If the report
+  cannot be produced the failure is logged and -1 is returned with
+  outdata left untouched.
+ */
+int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+	char *text = talloc_report_str(outdata, NULL);
+	size_t nbytes;
+
+	if (text == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " talloc_report_str failed\n"));
+		return -1;
+	}
+
+	/* the talloc size counts the trailing zero; do not send it */
+	nbytes = talloc_get_size(text);
+	outdata->dsize = (nbytes > 0) ? nbytes - 1 : 0;
+	outdata->dptr = (uint8_t *)text;
+
+	return 0;
+}
+
+/*
+  Log that a retired control was requested and fail it.
+
+  unsupported: name of the control that is no longer implemented
+  alternate:   name of the control to use instead, or NULL if there is none
+
+  Always returns -1.
+ */
+static int32_t control_not_implemented(const char *unsupported,
+				       const char *alternate)
+{
+	if (alternate != NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Control %s is not implemented any more, use %s instead\n",
+		       unsupported, alternate));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Control %s is not implemented any more\n",
+	       unsupported));
+	return -1;
+}
+
+struct ctdb_echo_data_state { /* state carried from ctdb_control_echo_data() to its timer callback */
+ struct ctdb_context *ctdb; /* daemon context used to send the reply */
+ struct ctdb_req_control_old *c; /* the request packet, taken over with talloc_move() so it outlives the dispatcher */
+ struct ctdb_echo_data *data; /* request payload as unpacked by ctdb_echo_data_pull() */
+};
+
+static void ctdb_echo_data_timeout( /* forward declaration: tevent timer callback registered by ctdb_control_echo_data() */
+ struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval now,
+ void *private_data); /* private_data is the struct ctdb_echo_data_state of the request */
+
+/*
+  Handle CTDB_CONTROL_ECHO_DATA.
+
+  Unpacks the request payload into a freshly allocated state object and
+  schedules ctdb_echo_data_timeout() to send the reply from a timer. On
+  success the request packet c is moved under the state, *async_reply is
+  set to true and 0 is returned. On any failure the state is freed and -1
+  is returned, leaving *async_reply untouched so the caller replies with
+  the error status.
+ */
+static int32_t ctdb_control_echo_data(
+	struct ctdb_context *ctdb,
+	struct ctdb_req_control_old *c,
+	TDB_DATA indata,
+	bool *async_reply)
+{
+	struct ctdb_echo_data_state *state = NULL;
+	struct tevent_timer *timer = NULL;
+	/*
+	 * NOTE(review): delay is never assigned from the request, so the
+	 * timer always fires immediately - confirm whether the unpacked
+	 * payload was meant to supply it.
+	 */
+	uint32_t delay = 0;
+	size_t consumed = 0;
+	int ret;
+
+	state = talloc_zero(ctdb, struct ctdb_echo_data_state);
+	CTDB_NO_MEMORY(ctdb, state);
+	state->ctdb = ctdb;
+
+	ret = ctdb_echo_data_pull(
+		indata.dptr, indata.dsize, state, &state->data, &consumed);
+	if (ret != 0) {
+		DBG_DEBUG("ctdb_echo_data_pull failed: %s\n",
+			  strerror(ret));
+		goto fail;
+	}
+
+	/* the timer is a talloc child of state, so freeing state cancels it */
+	timer = tevent_add_timer(
+		ctdb->ev,
+		state,
+		timeval_current_ofs_msec(delay),
+		ctdb_echo_data_timeout,
+		state);
+	if (timer == NULL) {
+		DBG_DEBUG("tevent_add_timer failed\n");
+		goto fail;
+	}
+
+	/* keep the request alive until the timer callback has replied */
+	state->c = talloc_move(state, &c);
+	*async_reply = true;
+
+	return 0;
+
+fail:
+	TALLOC_FREE(state);
+	return -1;
+}
+
+/*
+  Timer callback for CTDB_CONTROL_ECHO_DATA.
+
+  Packs the stored payload back into wire format and sends it as the
+  control reply with status 0. If the buffer cannot be allocated a
+  warning is logged and no reply is sent. The state (and with it the
+  request packet and buffer) is freed in either case.
+ */
+static void ctdb_echo_data_timeout(
+	struct tevent_context *ev,
+	struct tevent_timer *te,
+	struct timeval now,
+	void *private_data)
+{
+	struct ctdb_echo_data_state *state = talloc_get_type_abort(
+		private_data, struct ctdb_echo_data_state);
+	size_t wire_len = ctdb_echo_data_len(state->data);
+	uint8_t *wire = NULL;
+	size_t pushed;
+	TDB_DATA reply;
+
+	DBG_DEBUG("reqid=%"PRIu32" len=%zu\n", state->c->hdr.reqid, wire_len);
+
+	wire = talloc_array(state, uint8_t, wire_len);
+	if (wire != NULL) {
+		ctdb_echo_data_push(state->data, wire, &pushed);
+		reply = (TDB_DATA) { .dptr = wire, .dsize = pushed };
+
+		ctdb_request_control_reply(state->ctdb, state->c, &reply, 0, NULL);
+	} else {
+		DBG_WARNING("talloc_array(%zu) failed\n", wire_len);
+	}
+
+	TALLOC_FREE(state);
+}
+
+/*
+  Handle CTDB_CONTROL_DISABLE_NODE: set NODE_FLAGS_PERMANENTLY_DISABLED
+  in the flags of the node found for CTDB_CURRENT_NODE.
+
+  Returns 0 on success, -1 if that node cannot be found.
+ */
+static int ctdb_control_disable_node(struct ctdb_context *ctdb)
+{
+	struct ctdb_node *self = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
+
+	if (self != NULL) {
+		D_ERR("Disable node\n");
+		self->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
+		return 0;
+	}
+
+	/* Can't happen */
+	DBG_ERR("Unable to find current node\n");
+	return -1;
+}
+
+/*
+  Handle CTDB_CONTROL_ENABLE_NODE: clear NODE_FLAGS_PERMANENTLY_DISABLED
+  in the flags of the node found for CTDB_CURRENT_NODE.
+
+  Returns 0 on success, -1 if that node cannot be found.
+ */
+static int ctdb_control_enable_node(struct ctdb_context *ctdb)
+{
+	struct ctdb_node *self = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
+
+	if (self != NULL) {
+		D_ERR("Enable node\n");
+		self->flags &= ~NODE_FLAGS_PERMANENTLY_DISABLED;
+		return 0;
+	}
+
+	/* Can't happen */
+	DBG_ERR("Unable to find current node\n");
+	return -1;
+}
+
+/*
+ process a control request
+
+ Switches on c->opcode and either handles the control inline or hands it
+ to the matching ctdb_control_* function.
+
+ ctdb:        daemon context
+ c:           the request packet; passed on to handlers that reply later
+ indata:      request payload. Most cases check indata.dsize with
+              CHECK_CONTROL_DATA_SIZE / CHECK_CONTROL_MIN_DATA_SIZE before
+              reading indata.dptr (macros defined elsewhere; presumably
+              they fail the control on a size mismatch - confirm).
+ outdata:     reply payload to fill in. Several cases point outdata->dptr
+              at existing objects (ctdb->statistics, ctdb->tunable, a
+              database name, the static 'level') instead of copying.
+ srcnode:     node the request came from
+ errormsg:    optional error text for the reply; only passed on to
+              ctdb_control_set_recmode() here
+ async_reply: passed to handlers that may send the reply themselves later
+
+ The return value is used by ctdb_request_control() as the reply status.
+ Some controls return their result that way (GET_RECMODE, GET_PID,
+ GET_PNN, PING). Retired opcodes are routed to control_not_implemented(),
+ which returns -1; unknown opcodes are logged at DEBUG_CRIT and also
+ return -1.
+ */
+static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA indata,
+ TDB_DATA *outdata, uint32_t srcnode,
+ const char **errormsg,
+ bool *async_reply)
+{
+ uint32_t opcode = c->opcode;
+ uint64_t srvid = c->srvid;
+ uint32_t client_id = c->client_id;
+ static int level = DEBUG_ERR; /* static: GET_DEBUG points outdata at it, and outdata is read after this function returns */
+
+ switch (opcode) {
+ case CTDB_CONTROL_PROCESS_EXISTS: {
+ CHECK_CONTROL_DATA_SIZE(sizeof(pid_t));
+ return ctdb_control_process_exists(ctdb, *(pid_t *)indata.dptr);
+ }
+
+ case CTDB_CONTROL_SET_DEBUG: {
+ union {
+ uint8_t *ptr;
+ int32_t *level;
+ } debug;
+ CHECK_CONTROL_DATA_SIZE(sizeof(int32_t));
+ debug.ptr = indata.dptr;
+ debuglevel_set(*debug.level);
+ return 0;
+ }
+
+ case CTDB_CONTROL_GET_DEBUG: {
+ CHECK_CONTROL_DATA_SIZE(0);
+ level = debuglevel_get();
+ outdata->dptr = (uint8_t *)&(level);
+ outdata->dsize = sizeof(DEBUGLEVEL);
+ return 0;
+ }
+
+ case CTDB_CONTROL_STATISTICS: {
+ CHECK_CONTROL_DATA_SIZE(0);
+ ctdb->statistics.memory_used = talloc_total_size(NULL);
+ ctdb->statistics.num_clients = ctdb->num_clients;
+ ctdb->statistics.frozen = (ctdb_db_all_frozen(ctdb) ? 1 : 0);
+ ctdb->statistics.recovering = (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE);
+ ctdb->statistics.statistics_current_time = timeval_current();
+
+ outdata->dptr = (uint8_t *)&ctdb->statistics;
+ outdata->dsize = sizeof(ctdb->statistics);
+ return 0;
+ }
+
+ case CTDB_CONTROL_GET_ALL_TUNABLES: {
+ CHECK_CONTROL_DATA_SIZE(0);
+ outdata->dptr = (uint8_t *)&ctdb->tunable;
+ outdata->dsize = sizeof(ctdb->tunable);
+ return 0;
+ }
+
+ case CTDB_CONTROL_DUMP_MEMORY: {
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_dump_memory(ctdb, outdata);
+ }
+
+ case CTDB_CONTROL_STATISTICS_RESET: {
+ struct ctdb_db_context *ctdb_db;
+
+ CHECK_CONTROL_DATA_SIZE(0);
+ ZERO_STRUCT(ctdb->statistics);
+ for (ctdb_db = ctdb->db_list;
+ ctdb_db != NULL;
+ ctdb_db = ctdb_db->next) {
+ ctdb_db_statistics_reset(ctdb_db);
+ }
+ ctdb->statistics.statistics_start_time = timeval_current();
+ return 0;
+ }
+
+ case CTDB_CONTROL_GETVNNMAP:
+ return ctdb_control_getvnnmap(ctdb, opcode, indata, outdata);
+
+ case CTDB_CONTROL_GET_DBMAP:
+ return ctdb_control_getdbmap(ctdb, opcode, indata, outdata);
+
+ case CTDB_CONTROL_GET_NODEMAPv4:
+ return control_not_implemented("GET_NODEMAPv4", "GET_NODEMAP");
+
+ case CTDB_CONTROL_GET_NODEMAP:
+ return ctdb_control_getnodemap(ctdb, opcode, indata, outdata);
+
+ case CTDB_CONTROL_GET_NODES_FILE:
+ return ctdb_control_getnodesfile(ctdb, opcode, indata, outdata);
+
+ case CTDB_CONTROL_RELOAD_NODES_FILE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_reload_nodes_file(ctdb, opcode);
+
+ case CTDB_CONTROL_SET_DB_STICKY: {
+ uint32_t db_id;
+ struct ctdb_db_context *ctdb_db;
+
+ CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+ db_id = *(uint32_t *)indata.dptr;
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) return -1;
+ return ctdb_set_db_sticky(ctdb, ctdb_db);
+ }
+
+ case CTDB_CONTROL_SETVNNMAP:
+ return ctdb_control_setvnnmap(ctdb, opcode, indata, outdata);
+
+ case CTDB_CONTROL_PULL_DB:
+ return control_not_implemented("PULL_DB", NULL);
+
+ case CTDB_CONTROL_SET_DMASTER:
+ return control_not_implemented("SET_DMASTER", NULL);
+
+ case CTDB_CONTROL_PUSH_DB:
+ return control_not_implemented("PUSH_DB", NULL);
+
+ case CTDB_CONTROL_GET_RECMODE: {
+ return ctdb->recovery_mode;
+ }
+
+ case CTDB_CONTROL_SET_RECMASTER:
+ return control_not_implemented("SET_RECMASTER", NULL);
+
+ case CTDB_CONTROL_GET_RECMASTER:
+ return control_not_implemented("GET_RECMASTER", NULL);
+
+ case CTDB_CONTROL_GET_PID:
+ return getpid();
+
+ case CTDB_CONTROL_GET_PNN:
+ return ctdb->pnn;
+
+ case CTDB_CONTROL_PING:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb->num_clients;
+
+ case CTDB_CONTROL_GET_RUNSTATE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ outdata->dptr = (uint8_t *)&ctdb->runstate;
+ outdata->dsize = sizeof(uint32_t);
+ return 0;
+
+
+ case CTDB_CONTROL_SET_DB_READONLY: {
+ uint32_t db_id;
+ struct ctdb_db_context *ctdb_db;
+
+ CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+ db_id = *(uint32_t *)indata.dptr;
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) return -1;
+ return ctdb_set_db_readonly(ctdb, ctdb_db);
+ }
+ case CTDB_CONTROL_GET_DBNAME: {
+ uint32_t db_id;
+ struct ctdb_db_context *ctdb_db;
+
+ CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+ db_id = *(uint32_t *)indata.dptr;
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) return -1;
+ outdata->dptr = discard_const(ctdb_db->db_name);
+ outdata->dsize = strlen(ctdb_db->db_name)+1;
+ return 0;
+ }
+
+ case CTDB_CONTROL_GETDBPATH: {
+ uint32_t db_id;
+ struct ctdb_db_context *ctdb_db;
+
+ CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+ db_id = *(uint32_t *)indata.dptr;
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) return -1;
+ outdata->dptr = discard_const(ctdb_db->db_path);
+ outdata->dsize = strlen(ctdb_db->db_path)+1;
+ return 0;
+ }
+
+ case CTDB_CONTROL_DB_ATTACH:
+ return ctdb_control_db_attach(ctdb,
+ indata,
+ outdata,
+ 0,
+ srcnode,
+ client_id,
+ c,
+ async_reply);
+
+ case CTDB_CONTROL_DB_ATTACH_PERSISTENT:
+ return ctdb_control_db_attach(ctdb,
+ indata,
+ outdata,
+ CTDB_DB_FLAGS_PERSISTENT,
+ srcnode,
+ client_id,
+ c,
+ async_reply);
+
+ case CTDB_CONTROL_DB_ATTACH_REPLICATED:
+ return ctdb_control_db_attach(ctdb,
+ indata,
+ outdata,
+ CTDB_DB_FLAGS_REPLICATED,
+ srcnode,
+ client_id,
+ c,
+ async_reply);
+
+ case CTDB_CONTROL_SET_CALL:
+ return control_not_implemented("SET_CALL", NULL);
+
+ case CTDB_CONTROL_TRAVERSE_START:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start));
+ return ctdb_control_traverse_start(ctdb, indata, outdata, srcnode, client_id);
+
+ case CTDB_CONTROL_TRAVERSE_START_EXT:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start_ext));
+ return ctdb_control_traverse_start_ext(ctdb, indata, outdata, srcnode, client_id);
+
+ case CTDB_CONTROL_TRAVERSE_ALL:
+ return ctdb_control_traverse_all(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_TRAVERSE_ALL_EXT:
+ return ctdb_control_traverse_all_ext(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_TRAVERSE_DATA:
+ return ctdb_control_traverse_data(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_TRAVERSE_KILL:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start));
+ return ctdb_control_traverse_kill(ctdb, indata, outdata, srcnode);
+
+ case CTDB_CONTROL_REGISTER_SRVID:
+ return daemon_register_message_handler(ctdb, client_id, srvid);
+
+ case CTDB_CONTROL_DEREGISTER_SRVID:
+ return daemon_deregister_message_handler(ctdb, client_id, srvid);
+
+ case CTDB_CONTROL_CHECK_SRVIDS:
+ return control_not_implemented("CHECK_SRVIDS", NULL);
+
+ case CTDB_CONTROL_ENABLE_SEQNUM:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_ltdb_enable_seqnum(ctdb, *(uint32_t *)indata.dptr);
+
+ case CTDB_CONTROL_UPDATE_SEQNUM:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_ltdb_update_seqnum(ctdb, *(uint32_t *)indata.dptr, srcnode);
+
+ case CTDB_CONTROL_FREEZE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_freeze(ctdb, c, async_reply);
+
+ case CTDB_CONTROL_THAW:
+ return control_not_implemented("THAW", NULL);
+
+ case CTDB_CONTROL_SET_RECMODE:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg);
+
+ case CTDB_CONTROL_GET_MONMODE:
+ return control_not_implemented("GET_MONMODE", NULL);
+
+ case CTDB_CONTROL_ENABLE_MONITOR:
+ return control_not_implemented("ENABLE_MONITOR", NULL);
+
+ case CTDB_CONTROL_RUN_EVENTSCRIPTS:
+ return control_not_implemented("RUN_EVENTSCRIPTS", NULL);
+
+ case CTDB_CONTROL_DISABLE_MONITOR:
+ return control_not_implemented("DISABLE_MONITOR", NULL);
+
+ case CTDB_CONTROL_SHUTDOWN:
+ DEBUG(DEBUG_NOTICE,("Received SHUTDOWN command.\n"));
+ ctdb_shutdown_sequence(ctdb, 0);
+ /* In case above returns due to duplicate shutdown */
+ return 0;
+
+ case CTDB_CONTROL_TAKEOVER_IPv4:
+ return control_not_implemented("TAKEOVER_IPv4", "TAKEOVER_IP");
+
+ case CTDB_CONTROL_TAKEOVER_IP:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip));
+ return ctdb_control_takeover_ip(ctdb, c, indata, async_reply);
+
+ case CTDB_CONTROL_RELEASE_IPv4:
+ return control_not_implemented("RELEASE_IPv4", "RELEASE_IP");
+
+ case CTDB_CONTROL_RELEASE_IP:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip));
+ return ctdb_control_release_ip(ctdb, c, indata, async_reply);
+
+ case CTDB_CONTROL_IPREALLOCATED:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_ipreallocated(ctdb, c, async_reply);
+
+ case CTDB_CONTROL_GET_PUBLIC_IPSv4:
+ return control_not_implemented("GET_PUBLIC_IPSv4",
+ "GET_PUBLIC_IPS");
+
+ case CTDB_CONTROL_GET_PUBLIC_IPS:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_get_public_ips(ctdb, c, outdata);
+
+ case CTDB_CONTROL_TCP_CLIENT:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
+ return ctdb_control_tcp_client(ctdb, client_id, indata);
+
+ case CTDB_CONTROL_STARTUP:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_startup(ctdb, srcnode);
+
+ case CTDB_CONTROL_TCP_ADD:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
+ return ctdb_control_tcp_add(ctdb, indata, false);
+
+ case CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
+ return ctdb_control_tcp_add(ctdb, indata, true);
+
+ case CTDB_CONTROL_TCP_REMOVE:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection));
+ return ctdb_control_tcp_remove(ctdb, indata);
+
+ case CTDB_CONTROL_SET_TUNABLE:
+ return ctdb_control_set_tunable(ctdb, indata);
+
+ case CTDB_CONTROL_GET_TUNABLE:
+ return ctdb_control_get_tunable(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_LIST_TUNABLES:
+ return ctdb_control_list_tunables(ctdb, outdata);
+
+ case CTDB_CONTROL_MODIFY_FLAGS:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_node_flag_change));
+ return ctdb_control_modflags(ctdb, indata);
+
+ case CTDB_CONTROL_KILL_TCP:
+ return control_not_implemented("KILL_TCP", NULL);
+
+ case CTDB_CONTROL_GET_TCP_TICKLE_LIST:
+ CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr));
+ return ctdb_control_get_tcp_tickle_list(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_SET_TCP_TICKLE_LIST:
+ /* data size is verified in the called function */
+ return ctdb_control_set_tcp_tickle_list(ctdb, indata);
+
+ case CTDB_CONTROL_REGISTER_SERVER_ID:
+ return control_not_implemented("REGISTER_SERVER_ID", NULL);
+
+ case CTDB_CONTROL_UNREGISTER_SERVER_ID:
+ return control_not_implemented("UNREGISTER_SERVER_ID", NULL);
+
+ case CTDB_CONTROL_CHECK_SERVER_ID:
+ return control_not_implemented("CHECK_SERVER_ID", NULL);
+
+ case CTDB_CONTROL_GET_SERVER_ID_LIST:
+ return control_not_implemented("SERVER_ID_LIST", NULL);
+
+ case CTDB_CONTROL_PERSISTENT_STORE:
+ return control_not_implemented("PERSISTENT_STORE", NULL);
+
+ case CTDB_CONTROL_UPDATE_RECORD:
+ return ctdb_control_update_record(ctdb, c, indata, async_reply);
+
+ case CTDB_CONTROL_SEND_GRATUITOUS_ARP:
+ return ctdb_control_send_gratious_arp(ctdb, indata);
+
+ case CTDB_CONTROL_TRANSACTION_START:
+ return control_not_implemented("TRANSACTION_START", NULL);
+
+ case CTDB_CONTROL_TRANSACTION_COMMIT:
+ return control_not_implemented("TRANSACTION_COMMIT", NULL);
+
+ case CTDB_CONTROL_WIPE_DATABASE:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb));
+ return ctdb_control_wipe_database(ctdb, indata);
+
+ case CTDB_CONTROL_UPTIME:
+ return ctdb_control_uptime(ctdb, outdata);
+
+ case CTDB_CONTROL_START_RECOVERY:
+ return ctdb_control_start_recovery(ctdb, c, async_reply);
+
+ case CTDB_CONTROL_END_RECOVERY:
+ return ctdb_control_end_recovery(ctdb, c, async_reply);
+
+ case CTDB_CONTROL_TRY_DELETE_RECORDS:
+ return ctdb_control_try_delete_records(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_ADD_PUBLIC_IP:
+ return ctdb_control_add_public_address(ctdb, indata);
+
+ case CTDB_CONTROL_DEL_PUBLIC_IP:
+ return ctdb_control_del_public_address(ctdb, indata);
+
+ case CTDB_CONTROL_GET_CAPABILITIES:
+ return ctdb_control_get_capabilities(ctdb, outdata);
+
+ case CTDB_CONTROL_START_PERSISTENT_UPDATE:
+ return ctdb_control_start_persistent_update(ctdb, c, indata);
+
+ case CTDB_CONTROL_CANCEL_PERSISTENT_UPDATE:
+ return ctdb_control_cancel_persistent_update(ctdb, c, indata);
+
+ case CTDB_CONTROL_TRANS2_COMMIT:
+ case CTDB_CONTROL_TRANS2_COMMIT_RETRY:
+ return control_not_implemented("TRANS2_COMMIT", "TRANS3_COMMIT");
+
+ case CTDB_CONTROL_TRANS2_ERROR:
+ return control_not_implemented("TRANS2_ERROR", NULL);
+
+ case CTDB_CONTROL_TRANS2_FINISHED:
+ return control_not_implemented("TRANS2_FINISHED", NULL);
+
+ case CTDB_CONTROL_TRANS2_ACTIVE:
+ return control_not_implemented("TRANS2_ACTIVE", NULL);
+
+ case CTDB_CONTROL_TRANS3_COMMIT:
+ return ctdb_control_trans3_commit(ctdb, c, indata, async_reply);
+
+ case CTDB_CONTROL_RECD_PING:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_recd_ping(ctdb);
+
+ case CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS:
+ return control_not_implemented("GET_EVENT_SCRIPT_STATUS", NULL);
+
+ case CTDB_CONTROL_RECD_RECLOCK_LATENCY:
+ CHECK_CONTROL_DATA_SIZE(sizeof(double));
+ CTDB_UPDATE_RECLOCK_LATENCY(ctdb, "recd reclock", reclock.recd, *((double *)indata.dptr));
+ return 0;
+ case CTDB_CONTROL_GET_RECLOCK_FILE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ if (ctdb->recovery_lock != NULL) { /* otherwise outdata is left exactly as the caller passed it in */
+ outdata->dptr = discard_const(ctdb->recovery_lock);
+ outdata->dsize = strlen(ctdb->recovery_lock) + 1;
+ }
+ return 0;
+ case CTDB_CONTROL_SET_RECLOCK_FILE:
+ return control_not_implemented("SET_RECLOCK", NULL);
+
+ case CTDB_CONTROL_STOP_NODE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_stop_node(ctdb);
+
+ case CTDB_CONTROL_CONTINUE_NODE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_continue_node(ctdb);
+
+ case CTDB_CONTROL_SET_NATGWSTATE:
+ return control_not_implemented("SET_NATGWSTATE", NULL);
+
+ case CTDB_CONTROL_SET_LMASTERROLE: {
+ uint32_t lmasterrole;
+
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ lmasterrole = *(uint32_t *)indata.dptr;
+ if (lmasterrole == 0) {
+ ctdb->capabilities &= ~CTDB_CAP_LMASTER;
+ } else {
+ ctdb->capabilities |= CTDB_CAP_LMASTER;
+ }
+ return 0;
+ }
+
+ case CTDB_CONTROL_SET_RECMASTERROLE: {
+ uint32_t recmasterrole;
+
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ recmasterrole = *(uint32_t *)indata.dptr;
+ if (recmasterrole == 0) {
+ ctdb->capabilities &= ~CTDB_CAP_RECMASTER;
+ } else {
+ ctdb->capabilities |= CTDB_CAP_RECMASTER;
+ }
+ return 0;
+ }
+
+ case CTDB_CONTROL_ENABLE_SCRIPT:
+ return control_not_implemented("ENABLE_SCRIPT", NULL);
+
+ case CTDB_CONTROL_DISABLE_SCRIPT:
+ return control_not_implemented("DISABLE_SCRIPT", NULL);
+
+ case CTDB_CONTROL_SET_BAN_STATE:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_ban_state));
+ return ctdb_control_set_ban_state(ctdb, indata);
+
+ case CTDB_CONTROL_GET_BAN_STATE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_get_ban_state(ctdb, outdata);
+
+ case CTDB_CONTROL_SET_DB_PRIORITY:
+ return control_not_implemented("SET_DB_PRIORITY", NULL);
+
+ case CTDB_CONTROL_GET_DB_PRIORITY:
+ return control_not_implemented("GET_DB_PRIORITY", NULL);
+
+ case CTDB_CONTROL_TRANSACTION_CANCEL:
+ return control_not_implemented("TRANSACTION_CANCEL", NULL);
+
+ case CTDB_CONTROL_REGISTER_NOTIFY:
+ return ctdb_control_register_notify(ctdb, client_id, indata);
+
+ case CTDB_CONTROL_DEREGISTER_NOTIFY:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint64_t));
+ return ctdb_control_deregister_notify(ctdb, client_id, indata);
+
+ case CTDB_CONTROL_GET_LOG:
+ return control_not_implemented("GET_LOG", NULL);
+
+ case CTDB_CONTROL_CLEAR_LOG:
+ return control_not_implemented("CLEAR_LOG", NULL);
+
+ case CTDB_CONTROL_GET_DB_SEQNUM:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint64_t));
+ return ctdb_control_get_db_seqnum(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_DB_SET_HEALTHY:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_db_set_healthy(ctdb, indata);
+
+ case CTDB_CONTROL_DB_GET_HEALTH:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_db_get_health(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_GET_PUBLIC_IP_INFO:
+ CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr));
+ return ctdb_control_get_public_ip_info(ctdb, c, indata, outdata);
+
+ case CTDB_CONTROL_GET_IFACES:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_get_ifaces(ctdb, c, outdata);
+
+ case CTDB_CONTROL_SET_IFACE_LINK_STATE:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_iface));
+ return ctdb_control_set_iface_link(ctdb, c, indata);
+
+ case CTDB_CONTROL_GET_STAT_HISTORY:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_get_stat_history(ctdb, c, outdata);
+
+ case CTDB_CONTROL_SCHEDULE_FOR_DELETION: {
+ struct ctdb_control_schedule_for_deletion *d;
+ size_t size = offsetof(struct ctdb_control_schedule_for_deletion, key);
+ CHECK_CONTROL_MIN_DATA_SIZE(size); /* make sure the fixed header is present before reading d->keylen */
+ d = (struct ctdb_control_schedule_for_deletion *)indata.dptr;
+ size += d->keylen;
+ CHECK_CONTROL_DATA_SIZE(size);
+ return ctdb_control_schedule_for_deletion(ctdb, indata);
+ }
+ case CTDB_CONTROL_GET_DB_STATISTICS:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_get_db_statistics(ctdb, *(uint32_t *)indata.dptr, outdata);
+
+ case CTDB_CONTROL_RELOAD_PUBLIC_IPS:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_reload_public_ips(ctdb, c, async_reply);
+
+ case CTDB_CONTROL_RECEIVE_RECORDS:
+ return control_not_implemented("RECEIVE_RECORDS", NULL);
+
+ case CTDB_CONTROL_DB_DETACH:
+ return ctdb_control_db_detach(ctdb, indata, client_id);
+
+ case CTDB_CONTROL_DB_FREEZE:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_db_freeze(ctdb, c, *(uint32_t *)indata.dptr,
+ async_reply);
+
+ case CTDB_CONTROL_DB_THAW:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_db_thaw(ctdb, *(uint32_t *)indata.dptr);
+
+ case CTDB_CONTROL_DB_TRANSACTION_START:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb));
+ return ctdb_control_db_transaction_start(ctdb, indata);
+
+ case CTDB_CONTROL_DB_TRANSACTION_COMMIT:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb));
+ return ctdb_control_db_transaction_commit(ctdb, indata);
+
+ case CTDB_CONTROL_DB_TRANSACTION_CANCEL:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_db_transaction_cancel(ctdb, indata);
+
+ case CTDB_CONTROL_DB_PULL:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_pulldb_ext));
+ return ctdb_control_db_pull(ctdb, c, indata, outdata);
+
+ case CTDB_CONTROL_DB_PUSH_START:
+ CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_pulldb_ext));
+ return ctdb_control_db_push_start(ctdb, indata);
+
+ case CTDB_CONTROL_DB_PUSH_CONFIRM:
+ CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+ return ctdb_control_db_push_confirm(ctdb, indata, outdata);
+
+ case CTDB_CONTROL_DB_OPEN_FLAGS: {
+ uint32_t db_id;
+ struct ctdb_db_context *ctdb_db;
+ int tdb_flags;
+
+ CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+ db_id = *(uint32_t *)indata.dptr;
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) {
+ return -1;
+ }
+
+ tdb_flags = tdb_get_flags(ctdb_db->ltdb->tdb);
+
+ outdata->dptr = talloc_size(outdata, sizeof(tdb_flags));
+ if (outdata->dptr == NULL) {
+ return -1;
+ }
+
+ outdata->dsize = sizeof(tdb_flags);
+ memcpy(outdata->dptr, &tdb_flags, outdata->dsize);
+ return 0;
+ }
+
+ case CTDB_CONTROL_CHECK_PID_SRVID:
+ CHECK_CONTROL_DATA_SIZE((sizeof(pid_t) + sizeof(uint64_t)));
+ return ctdb_control_check_pid_srvid(ctdb, indata);
+
+ case CTDB_CONTROL_TUNNEL_REGISTER:
+ return ctdb_control_tunnel_register(ctdb, client_id, srvid);
+
+ case CTDB_CONTROL_TUNNEL_DEREGISTER:
+ return ctdb_control_tunnel_deregister(ctdb, client_id, srvid);
+
+ case CTDB_CONTROL_VACUUM_FETCH:
+ return ctdb_control_vacuum_fetch(ctdb, indata);
+
+ case CTDB_CONTROL_DB_VACUUM: {
+ struct ctdb_db_vacuum db_vacuum; /* never filled in; only passed to ctdb_db_vacuum_len() to get the expected payload size */
+
+ CHECK_CONTROL_DATA_SIZE(ctdb_db_vacuum_len(&db_vacuum));
+ return ctdb_control_db_vacuum(ctdb, c, indata, async_reply);
+ }
+ case CTDB_CONTROL_ECHO_DATA: {
+ return ctdb_control_echo_data(ctdb, c, indata, async_reply);
+ }
+
+ case CTDB_CONTROL_DISABLE_NODE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_disable_node(ctdb);
+
+ case CTDB_CONTROL_ENABLE_NODE:
+ CHECK_CONTROL_DATA_SIZE(0);
+ return ctdb_control_enable_node(ctdb);
+
+ default:
+ DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
+ return -1;
+ }
+}
+
+/*
+  send a reply for a ctdb control
+
+  ctdb:     daemon context
+  c:        the request being answered; supplies the destination node,
+            reqid and opcode of the reply
+  outdata:  reply payload, may be NULL for "no data"
+  status:   status code to put in the reply
+  errormsg: optional error text appended after the payload (without a
+            terminating zero), may be NULL
+
+  Nothing is sent if the request carries CTDB_CTRL_FLAG_NOREPLY. If the
+  reply packet cannot be allocated the failure is logged and the reply is
+  dropped.
+ */
+void ctdb_request_control_reply(struct ctdb_context *ctdb, struct ctdb_req_control_old *c,
+				TDB_DATA *outdata, int32_t status, const char *errormsg)
+{
+	struct ctdb_reply_control_old *reply;
+	size_t payload_len;
+	size_t pkt_len;
+
+	/* some controls send no reply */
+	if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
+		return;
+	}
+
+	payload_len = (outdata != NULL) ? outdata->dsize : 0;
+
+	pkt_len = offsetof(struct ctdb_reply_control_old, data) + payload_len;
+	if (errormsg != NULL) {
+		pkt_len += strlen(errormsg);
+	}
+
+	reply = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CONTROL, pkt_len, struct ctdb_reply_control_old);
+	if (reply == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ "Unable to allocate transport - OOM or transport is down\n"));
+		return;
+	}
+
+	reply->hdr.destnode = c->hdr.srcnode;
+	reply->hdr.reqid = c->hdr.reqid;
+	reply->status = status;
+
+	/* payload first ... */
+	reply->datalen = payload_len;
+	if (payload_len != 0) {
+		memcpy(&reply->data[0], outdata->dptr, payload_len);
+	}
+
+	/* ... then the error text directly behind it */
+	if (errormsg != NULL) {
+		reply->errorlen = strlen(errormsg);
+		memcpy(&reply->data[reply->datalen], errormsg, reply->errorlen);
+	}
+
+	ctdb_queue_packet_opcode(ctdb, &reply->hdr, c->opcode);
+
+	talloc_free(reply);
+}
+
+/*
+  called when a CTDB_REQ_CONTROL packet comes in
+
+  Wraps the request payload in a TDB_DATA, runs the control through
+  ctdb_control_dispatch() and, unless the handler asked to reply
+  asynchronously, sends the reply with the dispatcher's return value as
+  status.
+
+  The reply payload holder is allocated as a talloc child of the request
+  packet. If that allocation fails the control is answered with status -1
+  instead of being dispatched, because the dispatcher writes through
+  outdata unconditionally.
+*/
+void ctdb_request_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_control_old *c = (struct ctdb_req_control_old *)hdr;
+	TDB_DATA data, *outdata;
+	int32_t status;
+	bool async_reply = false;
+	const char *errormsg = NULL;
+
+	data.dptr = &c->data[0];
+	data.dsize = c->datalen;
+
+	outdata = talloc_zero(c, TDB_DATA);
+	if (outdata == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Out of memory allocating control reply data\n"));
+		/* a NULL outdata is accepted and sent as an empty payload */
+		ctdb_request_control_reply(ctdb, c, NULL, -1, "Out of memory");
+		return;
+	}
+
+	status = ctdb_control_dispatch(ctdb, c, data, outdata, hdr->srcnode,
+				       &errormsg, &async_reply);
+
+	if (!async_reply) {
+		ctdb_request_control_reply(ctdb, c, outdata, status, errormsg);
+	}
+}
+
+/*
+  called when a CTDB_REPLY_CONTROL packet comes in
+
+  Looks up the pending control by the reqid in the header, unpacks the
+  payload and optional error text, and invokes the callback registered
+  for the control. Replies with an unknown or mismatching reqid are
+  logged and dropped.
+
+  NOTE(review): datalen and errorlen are taken from the packet as-is; no
+  check against the packet length is visible here - confirm the caller
+  validates them.
+*/
+void ctdb_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_reply_control_old *reply = (struct ctdb_reply_control_old *)hdr;
+	struct ctdb_control_state *state;
+	const char *errormsg = NULL;
+	TDB_DATA payload;
+
+	state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_control_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_control\n",
+			 ctdb->pnn, hdr->reqid));
+		return;
+	}
+
+	if (state->reqid != hdr->reqid) {
+		/* we found a record but it was the wrong one */
+		DEBUG(DEBUG_ERR, ("Dropped orphaned control reply with reqid:%u\n", hdr->reqid));
+		return;
+	}
+
+	payload = (TDB_DATA) { .dptr = &reply->data[0], .dsize = reply->datalen };
+
+	/* the error text, if any, sits right behind the payload */
+	if (reply->errorlen != 0) {
+		errormsg = talloc_strndup(state,
+					  (char *)&reply->data[reply->datalen],
+					  reply->errorlen);
+	}
+
+	/* make state a child of the packet, so it goes away when the packet
+	   is freed. */
+	talloc_steal(hdr, state);
+
+	state->callback(ctdb, reply->status, payload, errormsg, state->private_data);
+}
+
+/*
+  talloc destructor for a pending control: removes the control's reqid
+  from the daemon's id registry. Always returns 0.
+ */
+static int ctdb_control_destructor(struct ctdb_control_state *state)
+{
+	struct ctdb_context *ctdb = state->ctdb;
+
+	reqid_remove(ctdb->idr, state->reqid);
+
+	return 0;
+}
+
+/*
+  handle a timeout of a control
+
+  Timer callback armed by ctdb_daemon_send_control(). Counts the timeout
+  in the statistics, reports status -1 with a fixed error text to the
+  control's callback, and then releases the control state.
+ */
+static void ctdb_control_timeout(struct tevent_context *ev,
+				 struct tevent_timer *te,
+				 struct timeval t, void *private_data)
+{
+	struct ctdb_control_state *state;
+	TALLOC_CTX *reaper;
+
+	state = talloc_get_type(private_data, struct ctdb_control_state);
+	reaper = talloc_new(ev);
+
+	CTDB_INCREMENT_STAT(state->ctdb, timeouts.control);
+
+	/*
+	 * Re-parent state under a throwaway context so that it stays valid
+	 * while the callback runs and is freed together with that context
+	 * afterwards.
+	 */
+	talloc_steal(reaper, state);
+
+	state->callback(state->ctdb, -1, tdb_null,
+			"ctdb_control timed out",
+			state->private_data);
+
+	talloc_free(reaper);
+}
+
+
+/*
+  send a control message to a node
+
+  ctdb:         daemon context
+  destnode:     target node number or one of the CTDB_BROADCAST_* values
+  srvid, opcode, client_id, flags, data:
+                copied into the request packet
+  callback:     called with the reply, on timeout, or immediately with
+                status -1 when the target is invalid or disconnected
+                (never called when CTDB_CTRL_FLAG_NOREPLY is set)
+  private_data: passed to callback; the pending-control state is made a
+                talloc child of it when it is non-NULL
+
+  Returns -1 if the transport is down, if a broadcast is attempted
+  without CTDB_CTRL_FLAG_NOREPLY, or if memory cannot be allocated.
+  Returns 0 otherwise, including when the target is disconnected and the
+  failure was reported through callback.
+ */
+int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
+			     uint64_t srvid, uint32_t opcode, uint32_t client_id,
+			     uint32_t flags,
+			     TDB_DATA data,
+			     ctdb_control_callback_fn_t callback,
+			     void *private_data)
+{
+	struct ctdb_req_control_old *c;
+	struct ctdb_control_state *state;
+	size_t len;
+	bool is_broadcast;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " Failed to send control. Transport is DOWN\n"));
+		return -1;
+	}
+
+	is_broadcast = (destnode == CTDB_BROADCAST_ACTIVE) ||
+		       (destnode == CTDB_BROADCAST_ALL) ||
+		       (destnode == CTDB_BROADCAST_CONNECTED);
+
+	if (is_broadcast && !(flags & CTDB_CTRL_FLAG_NOREPLY)) {
+		DEBUG(DEBUG_CRIT,("Attempt to broadcast control without NOREPLY\n"));
+		return -1;
+	}
+
+	if (!is_broadcast &&
+	    (!ctdb_validate_pnn(ctdb, destnode) ||
+	     (ctdb->nodes[destnode]->flags & NODE_FLAGS_DISCONNECTED))) {
+		if (!(flags & CTDB_CTRL_FLAG_NOREPLY)) {
+			callback(ctdb, -1, tdb_null, "ctdb_control to disconnected node", private_data);
+		}
+		return 0;
+	}
+
+	/* the state is made a child of private_data if possible. This means any reply
+	   will be discarded if the private_data goes away */
+	state = talloc(private_data?private_data:ctdb, struct ctdb_control_state);
+	CTDB_NO_MEMORY(ctdb, state);
+
+	state->reqid = reqid_new(ctdb->idr, state);
+	state->callback = callback;
+	state->private_data = private_data;
+	state->ctdb = ctdb;
+	state->flags = flags;
+
+	talloc_set_destructor(state, ctdb_control_destructor);
+
+	len = offsetof(struct ctdb_req_control_old, data) + data.dsize;
+	c = ctdb_transport_allocate(ctdb, state, CTDB_REQ_CONTROL, len,
+				    struct ctdb_req_control_old);
+	if (c == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Unable to allocate control packet\n"));
+		/*
+		 * Do not leave the state (and its registered reqid) behind:
+		 * freeing it runs the destructor, which removes the reqid.
+		 */
+		talloc_free(state);
+		return -1;
+	}
+	talloc_set_name_const(c, "ctdb_req_control packet");
+
+	c->hdr.destnode = destnode;
+	c->hdr.reqid = state->reqid;
+	c->opcode = opcode;
+	c->client_id = client_id;
+	c->flags = flags;
+	c->srvid = srvid;
+	c->datalen = data.dsize;
+	if (data.dsize) {
+		memcpy(&c->data[0], data.dptr, data.dsize);
+	}
+
+	ctdb_queue_packet(ctdb, &c->hdr);
+
+	if (flags & CTDB_CTRL_FLAG_NOREPLY) {
+		/* no reply expected: nothing to wait for (frees c as well) */
+		talloc_free(state);
+		return 0;
+	}
+
+	if (ctdb->tunable.control_timeout) {
+		tevent_add_timer(ctdb->ev, state,
+				 timeval_current_ofs(ctdb->tunable.control_timeout, 0),
+				 ctdb_control_timeout, state);
+	}
+
+	talloc_free(c);
+	return 0;
+}
diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c
new file mode 100644
index 0000000..edd7d57
--- /dev/null
+++ b/ctdb/server/ctdb_daemon.c
@@ -0,0 +1,2248 @@
+/*
+ ctdb daemon code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "system/time.h"
+
+#include <talloc.h>
+/* Allow use of deprecated function tevent_loop_allow_nesting() */
+#define TEVENT_DEPRECATED
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+#include "lib/util/blocking.h"
+#include "lib/util/become_daemon.h"
+
+#include "version.h"
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/pidfile.h"
+#include "common/sock_io.h"
+
+struct ctdb_client_pid_list { /* one entry of the ctdb->client_pids list, created per accepted client */
+ struct ctdb_client_pid_list *next, *prev; /* linkage used by DLIST_ADD()/DLIST_REMOVE() */
+ struct ctdb_context *ctdb; /* daemon context whose client_pids list holds this entry */
+ pid_t pid; /* peer pid of the client; stays 0 if ctdb_get_peer_pid() failed */
+ struct ctdb_client *client; /* the client connection this entry describes */
+};
+
+const char *ctdbd_pidfile = NULL; /* presumably the pidfile path, set elsewhere -- TODO confirm */
+static struct pidfile_context *ctdbd_pidfile_ctx = NULL; /* not referenced in this part of the file */
+
+static void daemon_incoming_packet(void *, struct ctdb_req_header *); /* forward declaration; defined further down */
+
+static pid_t __ctdbd_pid; /* compared against getpid() in print_exit_message() */
+
+/*
+ * Log a shutdown notice, but only in the process whose pid matches
+ * __ctdbd_pid; any other process returns without doing anything.
+ */
+static void print_exit_message(void)
+{
+	if (getpid() != __ctdbd_pid) {
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
+
+	/* Give pending log messages a second to be flushed */
+	sleep(1);
+}
+
+#ifdef HAVE_GETRUSAGE
+
+struct cpu_check_threshold_data { /* one CPU usage sample taken by ctdb_cpu_check_threshold() */
+ unsigned short percent; /* CPU time as a percentage of wall-clock time since the previous sample */
+ struct timeval timeofday; /* timer time at which the sample was taken */
+ struct timeval ru_time; /* user + system CPU time reported by getrusage() */
+};
+
+/*
+ * Periodic timer callback that warns when the CPU utilisation of this
+ * process crosses a threshold.
+ *
+ * Each run samples getrusage(RUSAGE_SELF) and computes the share of
+ * wall-clock time spent on the CPU (user + system) since the previous
+ * sample.  A warning is logged when that percentage is at or above the
+ * threshold (only if the value changed since the last sample) and once
+ * more when it falls back below the threshold.
+ *
+ * The threshold defaults to 90 and can be overridden with the
+ * environment variable CTDB_TEST_CPU_USAGE_THRESHOLD, which must be a
+ * plain decimal integer in the range 1..100.
+ *
+ * private_data must be the struct ctdb_context.  The callback always
+ * re-arms itself to run again 60 seconds later.
+ */
+static void ctdb_cpu_check_threshold(struct tevent_context *ev,
+				     struct tevent_timer *te,
+				     struct timeval tv,
+				     void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type_abort(
+		private_data, struct ctdb_context);
+	uint32_t interval = 60;
+
+	/* 0 means "not looked up yet"; valid thresholds are 1..100 */
+	static unsigned short threshold = 0;
+	static struct cpu_check_threshold_data prev = {
+		.percent = 0,
+		.timeofday = { .tv_sec = 0 },
+		.ru_time = { .tv_sec = 0 },
+	};
+
+	struct rusage usage;
+	struct cpu_check_threshold_data curr = {
+		.percent = 0,
+	};
+	int64_t ru_time_diff, timeofday_diff;
+	bool first;
+	int ret;
+
+	/*
+	 * Cache the threshold so that we don't waste time checking
+	 * the environment variable every time
+	 */
+	if (threshold == 0) {
+		const char *t;
+
+		threshold = 90;
+
+		t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD");
+		if (t != NULL) {
+			char *end = NULL;
+			long th;
+
+			/*
+			 * strtol() rather than atoi(): reject trailing
+			 * garbage and out-of-range input instead of
+			 * silently accepting it.
+			 */
+			errno = 0;
+			th = strtol(t, &end, 10);
+			if (errno != 0 || end == t || *end != '\0' ||
+			    th <= 0 || th > 100) {
+				DBG_WARNING("Failed to parse env var: %s\n", t);
+			} else {
+				threshold = th;
+			}
+		}
+	}
+
+	ret = getrusage(RUSAGE_SELF, &usage);
+	if (ret != 0) {
+		/* The return value is always -1; errno carries the reason */
+		DBG_WARNING("getrusage() failed: %s\n", strerror(errno));
+		goto next;
+	}
+
+	/* Sum the system and user CPU usage */
+	curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime);
+
+	curr.timeofday = tv;
+
+	first = timeval_is_zero(&prev.timeofday);
+	if (first) {
+		/* No previous values recorded so no calculation to do */
+		goto done;
+	}
+
+	timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday);
+	if (timeofday_diff <= 0) {
+		/*
+		 * Time went backwards or didn't progress so no (sane)
+		 * calculation can be done
+		 */
+		goto done;
+	}
+
+	ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time);
+
+	curr.percent = ru_time_diff * 100 / timeofday_diff;
+
+	if (curr.percent >= threshold) {
+		/* Log only if the utilisation changes */
+		if (curr.percent != prev.percent) {
+			D_WARNING("WARNING: CPU utilisation %hu%% >= "
+				  "threshold (%hu%%)\n",
+				  curr.percent,
+				  threshold);
+		}
+	} else {
+		/* Log if the utilisation falls below the threshold */
+		if (prev.percent >= threshold) {
+			D_WARNING("WARNING: CPU utilisation %hu%% < "
+				  "threshold (%hu%%)\n",
+				  curr.percent,
+				  threshold);
+		}
+	}
+
+done:
+	/* Remember this sample as the baseline for the next run */
+	prev = curr;
+
+next:
+	tevent_add_timer(ctdb->ev, ctdb,
+			 timeval_current_ofs(interval, 0),
+			 ctdb_cpu_check_threshold,
+			 ctdb);
+}
+
+/*
+ * Schedule the first run of ctdb_cpu_check_threshold() for the current
+ * time, i.e. as soon as the event loop gets to it.
+ */
+static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb)
+{
+	struct timeval now = timeval_current();
+
+	tevent_add_timer(ctdb->ev, ctdb, now, ctdb_cpu_check_threshold, ctdb);
+}
+#endif /* HAVE_GETRUSAGE */
+
+/*
+ * Once-per-second dummy timer event.  It does nothing except schedule
+ * itself again, and it stops doing so in any process whose pid differs
+ * from ctdb->ctdbd_pid.
+ */
+static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te,
+			   struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+	if (getpid() == ctdb->ctdbd_pid) {
+		struct timeval next_tick = timeval_current_ofs(1, 0);
+
+		tevent_add_timer(ctdb->ev, ctdb, next_tick, ctdb_time_tick, ctdb);
+	}
+}
+
+/*
+ * Arm the first once-per-second dummy event (ctdb_time_tick), which is
+ * used to make detection of hangs more reliable.
+ */
+static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
+{
+	struct timeval first_tick = timeval_current_ofs(1, 0);
+
+	tevent_add_timer(ctdb->ev, ctdb, first_tick, ctdb_time_tick, ctdb);
+}
+
+static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
+{
+ /* Start each of the daemon's periodic activities in turn.
+ First: monitoring for connected/disconnected nodes */
+ ctdb_start_keepalive(ctdb);
+
+ /* periodic update of the tcp tickle lists */
+ ctdb_start_tcp_tickle_update(ctdb);
+
+ /* start listening for recovery daemon pings */
+ ctdb_control_recd_ping(ctdb);
+
+ /* once-per-second dummy timer tick (see ctdb_start_time_tickd) */
+ ctdb_start_time_tickd(ctdb);
+
+#ifdef HAVE_GETRUSAGE
+ ctdb_start_cpu_check_threshold(ctdb); /* CPU utilisation warnings, only where getrusage() is available */
+#endif /* HAVE_GETRUSAGE */
+}
+
+/*
+ * Set the disposition of signal signum to SIG_IGN.  The result of
+ * sigaction() is not checked.
+ */
+static void ignore_signal(int signum)
+{
+	struct sigaction ignore;
+
+	memset(&ignore, 0, sizeof(ignore));
+
+	/* The mask contains only the signal being ignored */
+	sigemptyset(&ignore.sa_mask);
+	sigaddset(&ignore.sa_mask, signum);
+	ignore.sa_handler = SIG_IGN;
+
+	sigaction(signum, &ignore, NULL);
+}
+
+
+/*
+  queue a packet for sending to a client
+
+  A CTDB_REQ_MESSAGE packet is not queued when the client's queue is
+  already longer than the max_queue_depth_drop_msg tunable: the client
+  is freed instead and -1 is returned.  In every other case the result
+  of ctdb_queue_send() is returned.
+ */
+static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
+{
+	struct ctdb_context *ctdb = client->ctdb;
+
+	CTDB_INCREMENT_STAT(ctdb, client_packets_sent);
+
+	if (hdr->operation == CTDB_REQ_MESSAGE &&
+	    ctdb_queue_length(client->queue) > ctdb->tunable.max_queue_depth_drop_msg) {
+		DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
+		talloc_free(client);
+		return -1;
+	}
+
+	return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
+}
+
+/*
+  message handler for when we are in daemon mode. This redirects the message
+  to the right client
+
+  srvid        - service id the message was sent to; copied into the packet
+  data         - message payload; may be empty
+  private_data - the struct ctdb_client that registered the handler
+
+  A CTDB_REQ_MESSAGE packet holding a copy of the payload is built,
+  handed to daemon_queue_send() and then freed.  The packet is allocated
+  on the ctdb context rather than on the client, so freeing it remains
+  valid even if daemon_queue_send() frees the client.
+ */
+static void daemon_message_handler(uint64_t srvid, TDB_DATA data,
+				   void *private_data)
+{
+	struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
+	struct ctdb_req_message_old *r;
+	size_t len;
+
+	/* construct a message to send to the client containing the data */
+	len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
+	r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE,
+			       len, struct ctdb_req_message_old);
+	CTDB_NO_MEMORY_VOID(client->ctdb, r);
+
+	talloc_set_name_const(r, "req_message packet");
+
+	r->srvid = srvid;
+	r->datalen = data.dsize;
+	if (data.dsize > 0) {
+		/* an empty payload may come with a NULL dptr; don't memcpy from it */
+		memcpy(&r->data[0], data.dptr, data.dsize);
+	}
+
+	daemon_queue_send(client, &r->hdr);
+
+	talloc_free(r);
+}
+
+/*
+  called when the daemon received a request from a client to register
+  a srvid: daemon_message_handler is registered for that srvid on behalf
+  of the client
+
+  returns -1 if client_id does not name a connected client, otherwise
+  the result of srvid_register()
+ */
+int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+	struct ctdb_client *client;
+	int res;
+
+	client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
+		return -1;
+	}
+
+	res = srvid_register(ctdb->srv, client, srvid, daemon_message_handler,
+			     client);
+	if (res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
+			 (unsigned long long)srvid));
+		return res;
+	}
+
+	DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
+		 (unsigned long long)srvid));
+	return res;
+}
+
+/*
+  called when the daemon received a request from a client to remove
+  a srvid
+
+  returns -1 if client_id does not name a connected client, otherwise
+  the result of srvid_deregister()
+ */
+int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+	struct ctdb_client *client;
+
+	client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	if (client != NULL) {
+		return srvid_deregister(ctdb->srv, srvid, client);
+	}
+
+	DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
+	return -1;
+}
+
+/*
+ * Tunnel handler for a local client.
+ *
+ * data.dptr is interpreted as a struct ctdb_req_tunnel_old.  A new
+ * CTDB_REQ_TUNNEL packet is built from its flags and payload together
+ * with the given tunnel_id, handed to daemon_queue_send() for the client
+ * passed in private_data, and then freed.
+ */
+void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data,
+			   void *private_data)
+{
+	struct ctdb_client *client =
+		talloc_get_type_abort(private_data, struct ctdb_client);
+	struct ctdb_context *ctdb = client->ctdb;
+	struct ctdb_req_tunnel_old *in = (struct ctdb_req_tunnel_old *)data.dptr;
+	struct ctdb_req_tunnel_old *out;
+	size_t out_len;
+
+	out_len = offsetof(struct ctdb_req_tunnel_old, data) + in->datalen;
+	out = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_TUNNEL,
+				 out_len, struct ctdb_req_tunnel_old);
+	if (out == NULL) {
+		DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n"));
+		return;
+	}
+
+	talloc_set_name_const(out, "req_tunnel packet");
+
+	out->tunnel_id = tunnel_id;
+	out->flags = in->flags;
+	out->datalen = in->datalen;
+	memcpy(out->data, in->data, in->datalen);
+
+	daemon_queue_send(client, &out->hdr);
+
+	talloc_free(out);
+}
+
+/*
+  talloc destructor for a ctdb_client
+
+  Runs the takeover hook, releases the client's id and decrements the
+  client count.  Recovery mode is switched to CTDB_RECOVERY_ACTIVE if
+  the client still has persistent updates in flight, or if
+  find_ctdb_db() finds a database for client->db_id; in the latter case
+  that database's persistent_state is freed as well.
+*/
+static int ctdb_client_destructor(struct ctdb_client *client)
+{
+	struct ctdb_context *ctdb = client->ctdb;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_takeover_client_destructor_hook(client);
+	reqid_remove(ctdb->idr, client->client_id);
+	ctdb->num_clients--;
+
+	if (client->num_persistent_updates != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
+		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+	}
+
+	ctdb_db = find_ctdb_db(ctdb, client->db_id);
+	if (ctdb_db == NULL) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
+			  "commit active. Forcing recovery.\n"));
+	ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+	/*
+	 * trans3 transaction state:
+	 *
+	 * The destructor sets the pointer to NULL.
+	 */
+	talloc_free(ctdb_db->persistent_state);
+
+	return 0;
+}
+
+
+/*
+  called when the daemon received a request message from a local client
+  over the unix domain socket
+
+  A destination of CTDB_CURRENT_NODE is first replaced by this node's
+  pnn.  Messages for this node are passed to ctdb_request_message();
+  everything else is sent on with ctdb_daemon_send_message().
+ */
+static void daemon_request_message_from_client(struct ctdb_client *client,
+					       struct ctdb_req_message_old *c)
+{
+	struct ctdb_context *ctdb = client->ctdb;
+	TDB_DATA payload;
+	int res;
+
+	if (c->hdr.destnode == CTDB_CURRENT_NODE) {
+		c->hdr.destnode = ctdb_get_pnn(ctdb);
+	}
+
+	/* maybe the message is for another client on this node */
+	if (c->hdr.destnode == ctdb_get_pnn(ctdb)) {
+		ctdb_request_message(ctdb, (struct ctdb_req_header *)c);
+		return;
+	}
+
+	/* otherwise it is for a remote node */
+	payload.dptr = &c->data[0];
+	payload.dsize = c->datalen;
+
+	res = ctdb_daemon_send_message(ctdb, c->hdr.destnode,
+				       c->srvid, payload);
+	if (res == 0) {
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
+		 c->hdr.destnode));
+}
+
+
+struct daemon_call_state { /* per-call state kept by the daemon for a call made by a local client */
+ struct ctdb_client *client; /* client that issued the call and receives the reply */
+ uint32_t reqid; /* reqid of the client's request, copied into the reply header */
+ struct ctdb_call *call; /* the call being executed; also holds the reply data */
+ struct timeval start_time; /* when the call was started, used for latency statistics */
+
+ /* readonly request ? Set to 1 when the client passed CTDB_WANT_READONLY */
+ uint32_t readonly_fetch;
+ uint32_t client_callid; /* call id the client asked for; only set for readonly requests */
+};
+
+/*
+ complete a call from a client
+
+ Installed as state->async.fn by daemon_request_call_from_client(), with
+ the matching daemon_call_state as state->async.private_data.  Collects
+ the result with ctdb_daemon_call_recv(), builds a CTDB_REPLY_CALL packet
+ carrying the reqid of the client's request and queues it to the client.
+ For a readonly request whose original call id was CTDB_FETCH_FUNC the
+ leading struct ctdb_ltdb_header is stripped from the reply data first.
+
+ On the two early error returns (recv failure, packet allocation
+ failure) no reply is sent and dstate is left attached to the client.
+*/
+static void daemon_call_from_client_callback(struct ctdb_call_state *state)
+{
+ struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
+ struct daemon_call_state);
+ struct ctdb_reply_call_old *r;
+ int res;
+ uint32_t length;
+ struct ctdb_client *client = dstate->client;
+ struct ctdb_db_context *ctdb_db = state->ctdb_db;
+
+ talloc_steal(client, dstate); /* dstate now hangs off the client ... */
+ talloc_steal(dstate, dstate->call); /* ... and the call off dstate */
+
+ res = ctdb_daemon_call_recv(state, dstate->call);
+ if (res != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
+ CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+
+ CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
+ return; /* NOTE(review): dstate is not freed here -- confirm this is intended */
+ }
+
+ length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize;
+ /* If the client asked for readonly FETCH, we remapped this to
+ FETCH_WITH_HEADER when calling the daemon. So we must
+ strip the extra header off the reply data before passing
+ it back to the client.
+ NOTE(review): assumes reply_data.dsize is at least
+ sizeof(struct ctdb_ltdb_header) -- confirm.
+ */
+ if (dstate->readonly_fetch
+ && dstate->client_callid == CTDB_FETCH_FUNC) {
+ length -= sizeof(struct ctdb_ltdb_header);
+ }
+
+ r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
+ length, struct ctdb_reply_call_old);
+ if (r == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
+ CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+ CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
+ return;
+ }
+ r->hdr.reqid = dstate->reqid; /* lets the client match the reply to its request */
+ r->status = dstate->call->status;
+
+ if (dstate->readonly_fetch
+ && dstate->client_callid == CTDB_FETCH_FUNC) {
+ /* client only asked for a FETCH so we must strip off
+ the extra ctdb_ltdb header
+ */
+ r->datalen = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
+ memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
+ } else {
+ r->datalen = dstate->call->reply_data.dsize;
+ memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
+ }
+
+ res = daemon_queue_send(client, &r->hdr);
+ if (res == -1) {
+ /* client is dead - return immediately; neither client nor
+ dstate is touched again on this path */
+ return;
+ }
+ if (res != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
+ }
+ CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
+ CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+ talloc_free(dstate); /* also frees the call and the reply packet, both children of dstate */
+}
+
+struct ctdb_daemon_packet_wrap { /* refers to a client by id rather than by pointer, so a disconnect can be detected later */
+ struct ctdb_context *ctdb; /* context whose idr is used to look the client up again */
+ uint32_t client_id; /* id of the client the deferred packet belongs to */
+};
+
+/*
+  a wrapper to catch disconnected clients
+
+  p must be a struct ctdb_daemon_packet_wrap.  The client is looked up
+  again by id; the wrapper is freed in either case, and the packet is
+  only passed to daemon_incoming_packet() if the client still exists.
+ */
+static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
+{
+	struct ctdb_daemon_packet_wrap *wrap;
+	struct ctdb_client *client;
+
+	wrap = talloc_get_type(p, struct ctdb_daemon_packet_wrap);
+	if (wrap == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
+		return;
+	}
+
+	client = reqid_find(wrap->ctdb->idr, wrap->client_id, struct ctdb_client);
+	if (client != NULL) {
+		talloc_free(wrap);
+
+		/* process it */
+		daemon_incoming_packet(client, hdr);
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+		 wrap->client_id));
+	talloc_free(wrap);
+}
+
+struct ctdb_deferred_fetch_call { /* one client call parked behind an in-flight fetch of the same key */
+ struct ctdb_deferred_fetch_call *next, *prev; /* linkage in ctdb_deferred_fetch_queue.deferred_calls */
+ struct ctdb_req_call_old *c; /* the deferred request packet */
+ struct ctdb_daemon_packet_wrap *w; /* identifies the requesting client by id */
+};
+
+struct ctdb_deferred_fetch_queue { /* per-key queue stored in ctdb_db->deferred_fetch */
+ struct ctdb_deferred_fetch_call *deferred_calls; /* deferred calls, oldest first (added with DLIST_ADD_END) */
+};
+
+struct ctdb_deferred_requeue { /* private data of the timer that re-runs one deferred call */
+ struct ctdb_deferred_fetch_call *dfc; /* the deferred call to reprocess */
+ struct ctdb_client *client; /* client on whose behalf it is reprocessed */
+};
+
+/*
+ * Timer callback that restarts processing of one deferred call.
+ * private_data is the struct ctdb_deferred_requeue set up by
+ * deferred_fetch_queue_destructor().  The request packet is handed over
+ * to the client and fed back into daemon_incoming_packet(); afterwards
+ * the requeue structure is freed.
+ */
+static void reprocess_deferred_call(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval t, void *private_data)
+{
+	struct ctdb_deferred_requeue *requeue =
+		(struct ctdb_deferred_requeue *)private_data;
+	struct ctdb_client *client = requeue->client;
+	struct ctdb_req_call_old *req = requeue->dfc->c;
+
+	talloc_steal(client, req);
+	daemon_incoming_packet(client, (struct ctdb_req_header *)req);
+	talloc_free(requeue);
+}
+
+/*
+ * talloc destructor for a deferred fetch queue.
+ *
+ * The queue is destroyed either after a timeout or when the initial
+ * fetch-lock has finished.  At that point every queued call is scheduled
+ * for reprocessing via a zero-delay timer, so that the waiting clients
+ * can pick up the record right away.
+ *
+ * This is done explicitly here, rather than with a destructor on each
+ * deferred call, because the calls have to be reissued in the same
+ * order in which they were queued.  Calls from clients that have
+ * disconnected in the meantime are skipped.
+ */
+static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
+{
+	struct ctdb_deferred_fetch_call *dfc;
+
+	while ((dfc = dfq->deferred_calls) != NULL) {
+		struct ctdb_daemon_packet_wrap *wrap;
+		struct ctdb_deferred_requeue *requeue;
+		struct ctdb_client *client;
+
+		DLIST_REMOVE(dfq->deferred_calls, dfc);
+		wrap = dfc->w;
+
+		client = reqid_find(wrap->ctdb->idr, wrap->client_id, struct ctdb_client);
+		if (client == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+				 wrap->client_id));
+			continue;
+		}
+
+		/* process it by pushing it back onto the eventloop */
+		requeue = talloc(client, struct ctdb_deferred_requeue);
+		if (requeue == NULL) {
+			DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
+			continue;
+		}
+
+		requeue->dfc = talloc_steal(requeue, dfc);
+		requeue->client = client;
+
+		tevent_add_timer(wrap->ctdb->ev, client, timeval_zero(),
+				 reprocess_deferred_call, requeue);
+	}
+
+	return 0;
+}
+
+/*
+ * Callback used when inserting a new deferral context into the rb tree.
+ * parm is the new context and data the one already stored for the key,
+ * if any.  There should never be one; if there is, it is reported and
+ * freed.  The new context is always returned.
+ */
+static void *insert_dfq_callback(void *parm, void *data)
+{
+	if (data == NULL) {
+		return parm;
+	}
+
+	DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
+	talloc_free(data);
+
+	return parm;
+}
+
+/*
+ * Timer callback for a fetch-lock that did not complete within a
+ * reasonable time: frees the deferral context passed as private_data,
+ * which makes its destructor push all deferred requests back into the
+ * event system.
+ */
+static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te,
+			struct timeval t, void *private_data)
+{
+	void *deferral_ctx = private_data;
+
+	talloc_free(deferral_ctx);
+}
+
+/*
+ * Register the key of a call as "being fetched" in the given database.
+ * While the remote fetch is in flight, any further attempts to fetch the
+ * same record are deferred until the fetch completes (see
+ * requeue_duplicate_fetch()).
+ *
+ * Returns 0 on success and -1 if an allocation fails.
+ */
+static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
+{
+	struct ctdb_deferred_fetch_queue *queue;
+	uint32_t *idkey;
+
+	idkey = ctdb_key_to_idkey(call, call->key);
+	if (idkey == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+		return -1;
+	}
+
+	queue = talloc(call, struct ctdb_deferred_fetch_queue);
+	if (queue == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
+		talloc_free(idkey);
+		return -1;
+	}
+	queue->deferred_calls = NULL;
+
+	trbt_insertarray32_callback(ctdb_db->deferred_fetch, idkey[0], &idkey[0],
+				    insert_dfq_callback, queue);
+	/* the key is only needed for the insertion */
+	talloc_free(idkey);
+
+	talloc_set_destructor(queue, deferred_fetch_queue_destructor);
+
+	/* if the fetch hasn't completed in 30 seconds, just tear it all down
+	   and let it try again as the events are reissued */
+	tevent_add_timer(ctdb_db->ctdb->ev, queue, timeval_current_ofs(30, 0),
+			 dfq_timeout, queue);
+
+	return 0;
+}
+
+/*
+ * Check whether this request duplicates a fetch that is already in
+ * flight for the same key.  If so, park the request on that fetch's
+ * deferral queue so it is reprocessed once the fetch completes.
+ *
+ * Returns 0 if the request was deferred (the packet c is then owned by
+ * the queue entry), and -1 if there is no fetch in flight for the key
+ * or an allocation failed.
+ */
+static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c)
+{
+	struct ctdb_deferred_fetch_queue *queue;
+	struct ctdb_deferred_fetch_call *entry;
+	uint32_t *idkey;
+
+	idkey = ctdb_key_to_idkey(c, key);
+	if (idkey == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+		return -1;
+	}
+
+	/* the key is only needed for the lookup */
+	queue = trbt_lookuparray32(ctdb_db->deferred_fetch, idkey[0], &idkey[0]);
+	talloc_free(idkey);
+	if (queue == NULL) {
+		return -1;
+	}
+
+	entry = talloc(queue, struct ctdb_deferred_fetch_call);
+	if (entry == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
+		return -1;
+	}
+
+	entry->w = talloc(entry, struct ctdb_daemon_packet_wrap);
+	if (entry->w == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
+		talloc_free(entry);
+		return -1;
+	}
+
+	entry->c = talloc_steal(entry, c);
+	entry->w->ctdb = ctdb_db->ctdb;
+	entry->w->client_id = client->client_id;
+
+	DLIST_ADD_END(queue->deferred_calls, entry);
+
+	return 0;
+}
+
+
+/*
+  this is called when the ctdb daemon received a ctdb request call
+  from a local client over the unix domain socket
+
+  Outline:
+   - look up the database and lock + fetch the record; if the lock
+     cannot be taken right away the packet is requeued and re-enters via
+     daemon_incoming_packet_wrap()
+   - with the fetch_collapse tunable set, defer the request if a fetch
+     for the same key is already in flight
+   - handle read-only delegations: finish a completed revoke, defer the
+     call while a revoke is running, or start a revoke
+   - otherwise start the call locally (if this node is dmaster) or
+     remotely; daemon_call_from_client_callback() sends the reply
+
+  pending_calls is incremented on entry and decremented again on every
+  path that does not leave a call outstanding.
+ */
+static void daemon_request_call_from_client(struct ctdb_client *client,
+					    struct ctdb_req_call_old *c)
+{
+	struct ctdb_call_state *state;
+	struct ctdb_db_context *ctdb_db;
+	struct daemon_call_state *dstate;
+	struct ctdb_call *call;
+	struct ctdb_ltdb_header header;
+	TDB_DATA key, data;
+	int ret;
+	struct ctdb_context *ctdb = client->ctdb;
+	struct ctdb_daemon_packet_wrap *w;
+
+	CTDB_INCREMENT_STAT(ctdb, total_calls);
+	CTDB_INCREMENT_STAT(ctdb, pending_calls);
+
+	ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x\n",
+			  c->db_id));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/*
+		 * this is just a warning, as the tdb should be empty anyway,
+		 * and only persistent databases can be unhealthy, which doesn't
+		 * use this code path
+		 */
+		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
+	/* the key is the first keylen bytes of the packet payload */
+	key.dptr = c->data;
+	key.dsize = c->keylen;
+
+	/*
+	 * The wrapper identifies the client by id, so that a requeued
+	 * packet can detect a client that disconnected in the meantime.
+	 */
+	w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
+	CTDB_NO_MEMORY_VOID(ctdb, w);
+
+	w->ctdb = ctdb;
+	w->client_id = client->client_id;
+
+	ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
+					   (struct ctdb_req_header *)c, &data,
+					   daemon_incoming_packet_wrap, w, true);
+	if (ret == -2) {
+		/* will retry later */
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	/* not requeued, so the wrapper is no longer needed */
+	talloc_free(w);
+
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+
+	/* check if this fetch request is a duplicate for a
+	   request we already have in flight. If so defer it until
+	   the first request completes.
+	*/
+	if (ctdb->tunable.fetch_collapse == 1) {
+		if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
+			ret = ctdb_ltdb_unlock(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+			}
+			CTDB_DECREMENT_STAT(ctdb, pending_calls);
+			talloc_free(data.dptr);
+			return;
+		}
+	}
+
+	/* Dont do READONLY if we don't have a tracking database */
+	if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
+		c->flags &= ~CTDB_WANT_READONLY;
+	}
+
+	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+		header.flags &= ~CTDB_REC_RO_FLAGS;
+		CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+		CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+		}
+		/* and clear out the tracking data */
+		if (tdb_delete(ctdb_db->rottdb, key) != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+		}
+	}
+
+	/* if we are revoking, we must defer all other calls until the revoke
+	 * had completed.
+	 */
+	if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+		talloc_free(data.dptr);
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	/*
+	 * We are dmaster, the client wants more than a read-only copy and
+	 * delegations exist: start revoking them and defer this call until
+	 * the revoke has finished.
+	 */
+	if ((header.dmaster == ctdb->pnn)
+	    && (!(c->flags & CTDB_WANT_READONLY))
+	    && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+		header.flags |= CTDB_REC_RO_REVOKING_READONLY;
+		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to start record revoke");
+		}
+		talloc_free(data.dptr);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+
+	dstate = talloc(client, struct daemon_call_state);
+	if (dstate == NULL) {
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		return;
+	}
+	dstate->start_time = timeval_current();
+	dstate->client = client;
+	dstate->reqid = c->hdr.reqid;
+	/* the fetched record data now lives and dies with dstate */
+	talloc_steal(dstate, data.dptr);
+
+	call = dstate->call = talloc_zero(dstate, struct ctdb_call);
+	if (call == NULL) {
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
+		/* nothing refers to dstate yet, so don't leave it on the client */
+		talloc_free(dstate);
+		return;
+	}
+
+	dstate->readonly_fetch = 0;
+	call->call_id = c->callid;
+	call->key = key;
+	/* the call data follows the key in the packet payload */
+	call->call_data.dptr = c->data + c->keylen;
+	call->call_data.dsize = c->calldatalen;
+	call->flags = c->flags;
+
+	if (c->flags & CTDB_WANT_READONLY) {
+		/* client wants readonly record, so translate this into a
+		   fetch with header. remember what the client asked for
+		   so we can remap the reply back to the proper format for
+		   the client in the reply
+		 */
+		dstate->client_callid = call->call_id;
+		call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
+		dstate->readonly_fetch = 1;
+	}
+
+	if (header.dmaster == ctdb->pnn) {
+		state = ctdb_call_local_send(ctdb_db, call, &header, &data);
+	} else {
+		state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
+		if (ctdb->tunable.fetch_collapse == 1) {
+			/* This request triggered a remote fetch-lock.
+			   set up a deferral for this key so any additional
+			   fetch-locks are deferred until the current one
+			   finishes.
+			 */
+			setup_deferred_fetch_locks(ctdb_db, call);
+		}
+	}
+
+	ret = ctdb_ltdb_unlock(ctdb_db, key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+	}
+
+	if (state == NULL) {
+		/* NOTE(review): dstate and call stay attached to the client on this path */
+		DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
+		CTDB_DECREMENT_STAT(ctdb, pending_calls);
+		CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
+		return;
+	}
+	/* ownership chain is now client -> state -> dstate -> call */
+	talloc_steal(state, dstate);
+	talloc_steal(client, state);
+
+	state->async.fn = daemon_call_from_client_callback;
+	state->async.private_data = dstate;
+}
+
+
+static void daemon_request_control_from_client(struct ctdb_client *client,
+ struct ctdb_req_control_old *c); /* forward declaration; called from daemon_incoming_packet() */
+static void daemon_request_tunnel_from_client(struct ctdb_client *client,
+ struct ctdb_req_tunnel_old *c); /* forward declaration; called from daemon_incoming_packet() */
+
+/*
+ * Dispatch one packet received from a client.
+ *
+ * p is the struct ctdb_client and hdr the packet.  Packets with a wrong
+ * magic or protocol version are rejected; all others are passed to the
+ * handler for their operation type.
+ */
+static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
+{
+	struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
+	struct ctdb_context *ctdb = client->ctdb;
+	TALLOC_CTX *tmp_ctx;
+
+	/* The packet is made a child of tmp_ctx, which is freed at the
+	   end.  A handler that wants to keep the packet steals it
+	   somewhere else, and the final talloc_free() then no longer
+	   affects it */
+	tmp_ctx = talloc_new(client);
+	talloc_steal(tmp_ctx, hdr);
+
+	if (hdr->ctdb_magic != CTDB_MAGIC) {
+		ctdb_set_error(ctdb, "Non CTDB packet rejected in daemon\n");
+	} else if (hdr->ctdb_version != CTDB_PROTOCOL) {
+		ctdb_set_error(ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+	} else {
+		switch (hdr->operation) {
+		case CTDB_REQ_CALL:
+			CTDB_INCREMENT_STAT(ctdb, client.req_call);
+			daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr);
+			break;
+
+		case CTDB_REQ_MESSAGE:
+			CTDB_INCREMENT_STAT(ctdb, client.req_message);
+			daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr);
+			break;
+
+		case CTDB_REQ_CONTROL:
+			CTDB_INCREMENT_STAT(ctdb, client.req_control);
+			daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr);
+			break;
+
+		case CTDB_REQ_TUNNEL:
+			CTDB_INCREMENT_STAT(ctdb, client.req_tunnel);
+			daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr);
+			break;
+
+		default:
+			DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
+				 hdr->operation));
+			break;
+		}
+	}
+
+	talloc_free(tmp_ctx);
+}
+
+/*
+  called when the daemon gets an incoming packet from a client
+
+  data - the packet; this function is responsible for it, either by
+         passing it on to daemon_incoming_packet() or by freeing it
+  cnt  - number of bytes in data; 0 means the client is to be torn down
+  args - the struct ctdb_client the packet was read from
+
+  Packets that are too short to hold a header, or that carry a wrong
+  magic or protocol version, are rejected and freed.
+ */
+static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+	struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
+	struct ctdb_req_header *hdr;
+
+	if (cnt == 0) {
+		talloc_free(client);
+		return;
+	}
+
+	CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
+
+	if (cnt < sizeof(*hdr)) {
+		ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
+			       (unsigned)cnt);
+		/* free the packet like every other rejected packet */
+		goto err_out;
+	}
+	hdr = (struct ctdb_req_header *)data;
+
+	if (hdr->ctdb_magic != CTDB_MAGIC) {
+		ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
+		goto err_out;
+	}
+
+	if (hdr->ctdb_version != CTDB_PROTOCOL) {
+		ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+		goto err_out;
+	}
+
+	DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
+		 "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
+		 hdr->srcnode, hdr->destnode));
+
+	/* it is the responsibility of the incoming packet function to free 'data' */
+	daemon_incoming_packet(client, hdr);
+	return;
+
+err_out:
+	TALLOC_FREE(data);
+}
+
+
+/*
+ * talloc destructor for a client pid entry: unlinks the entry from
+ * ctdb->client_pids, unless that list is already empty.
+ */
+static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
+{
+	struct ctdb_context *ctdb = client_pid->ctdb;
+
+	if (ctdb->client_pids == NULL) {
+		return 0;
+	}
+
+	DLIST_REMOVE(ctdb->client_pids, client_pid);
+	return 0;
+}
+
+/*
+ * Allocate a non-zero id for a new client from idr.
+ *
+ * On success the id is stored in *out and 0 is returned.  EINVAL is
+ * returned if no id other than 0 could be obtained; *out is left
+ * untouched in that case.
+ */
+static int get_new_client_id(struct reqid_context *idr,
+			     struct ctdb_client *client,
+			     uint32_t *out)
+{
+	uint32_t id = reqid_new(idr, client);
+
+	/*
+	 * Some places in the code (e.g. ctdb_control_db_attach(),
+	 * ctdb_control_db_detach()) assign a special meaning to
+	 * client_id 0.  The assumption is that if client_id is 0 then
+	 * the control has come from another daemon.  Therefore, we
+	 * should never return client_id == 0.
+	 */
+	if (id == 0) {
+		/*
+		 * Don't leak ID 0.  This is safe because the ID keeps
+		 * increasing.  A test will be added to ensure that
+		 * this doesn't change.
+		 */
+		reqid_remove(idr, 0);
+
+		/* second and last attempt */
+		id = reqid_new(idr, client);
+	}
+
+	if (id == REQID_INVALID) {
+		return EINVAL;
+	}
+
+	if (id == 0) {
+		/* Every other ID must have been used and we can't use 0 */
+		reqid_remove(idr, 0);
+		return EINVAL;
+	}
+
+	*out = id;
+	return 0;
+}
+
+/*
+ * fd event handler for the daemon's listening unix domain socket:
+ * accepts one connection and sets up the struct ctdb_client for it.
+ *
+ * private_data must be the struct ctdb_context.  The accepted socket is
+ * made close-on-exec and non-blocking, the client gets a non-zero
+ * client id, an entry in ctdb->client_pids and a packet queue that
+ * delivers incoming packets to ctdb_daemon_read_cb().
+ *
+ * If any step before the queue setup fails, the socket is closed and
+ * everything set up so far is undone.
+ */
+static void ctdb_accept_client(struct tevent_context *ev,
+			       struct tevent_fd *fde, uint16_t flags,
+			       void *private_data)
+{
+	struct sockaddr_un addr;
+	socklen_t len;
+	int fd;
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	struct ctdb_client *client;
+	struct ctdb_client_pid_list *client_pid;
+	pid_t peer_pid = 0;
+	int ret;
+
+	memset(&addr, 0, sizeof(addr));
+	len = sizeof(addr);
+	fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
+	if (fd == -1) {
+		return;
+	}
+	smb_set_close_on_exec(fd);
+
+	ret = set_blocking(fd, false);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " failed to set socket non-blocking (%s)\n",
+		       strerror(errno)));
+		close(fd);
+		return;
+	}
+
+	set_close_on_exec(fd);
+
+	DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
+
+	client = talloc_zero(ctdb, struct ctdb_client);
+	if (client == NULL) {
+		/* don't dereference a failed allocation below */
+		DEBUG(DEBUG_ERR,("Failed to allocate client structure\n"));
+		close(fd);
+		return;
+	}
+
+	/* peer_pid stays 0 if the peer's pid cannot be determined */
+	if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
+		DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
+	}
+
+	client->ctdb = ctdb;
+	client->fd = fd;
+
+	ret = get_new_client_id(ctdb->idr, client, &client->client_id);
+	if (ret != 0) {
+		DBG_ERR("Unable to get client ID (%d)\n", ret);
+		close(fd);
+		talloc_free(client);
+		return;
+	}
+
+	client->pid = peer_pid;
+
+	client_pid = talloc(client, struct ctdb_client_pid_list);
+	if (client_pid == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
+		/*
+		 * The client destructor is not installed yet, so the id
+		 * must be released by hand; otherwise ctdb->idr would
+		 * keep pointing at the freed client.
+		 */
+		reqid_remove(ctdb->idr, client->client_id);
+		close(fd);
+		talloc_free(client);
+		return;
+	}
+	client_pid->ctdb = ctdb;
+	client_pid->pid = peer_pid;
+	client_pid->client = client;
+
+	DLIST_ADD(ctdb->client_pids, client_pid);
+
+	/* NOTE(review): the result of ctdb_queue_setup() is not checked */
+	client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
+					 ctdb_daemon_read_cb, client,
+					 "client-%u", (unsigned)client->pid);
+
+	/*
+	 * From here on, freeing the client releases its id, updates
+	 * num_clients and unlinks the pid entry.
+	 */
+	talloc_set_destructor(client, ctdb_client_destructor);
+	talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
+	ctdb->num_clients++;
+}
+
+
+
+/*
+ * Create a unix domain socket, bind it, secure it and listen.
+ *
+ * The descriptor is stored in ctdb->daemon.sd and the socket is bound
+ * to the path in ctdb->daemon.name.  Unless test_mode_enabled is set
+ * the socket file is chown()ed to the effective uid/gid; it is always
+ * chmod()ed to 0700.
+ *
+ * Returns 0 on success.  On failure -1 is returned; if the socket had
+ * already been created it is closed and ctdb->daemon.sd is reset to -1.
+ */
+static int ux_socket_bind(struct ctdb_context *ctdb, bool test_mode_enabled)
+{
+	struct sockaddr_un addr = { .sun_family = AF_UNIX };
+	int ret;
+
+	ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ctdb->daemon.sd == -1) {
+		return -1;
+	}
+
+	/*
+	 * Refuse paths that do not fit: a truncated sun_path would bind
+	 * a different file from the one chown()ed/chmod()ed below.
+	 */
+	if (strlen(ctdb->daemon.name) >= sizeof(addr.sun_path)) {
+		D_ERR("ctdb socket path '%s' is too long\n",
+		      ctdb->daemon.name);
+		goto failed;
+	}
+	strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
+
+	if (! sock_clean(ctdb->daemon.name)) {
+		/* Take the common exit so the socket is not leaked */
+		goto failed;
+	}
+
+	set_close_on_exec(ctdb->daemon.sd);
+
+	ret = set_blocking(ctdb->daemon.sd, false);
+	if (ret != 0) {
+		DBG_ERR("Failed to set socket non-blocking (%s)\n",
+			strerror(errno));
+		goto failed;
+	}
+
+	ret = bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr));
+	if (ret == -1) {
+		D_ERR("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name);
+		goto failed;
+	}
+
+	/* Ownership is only enforced outside of test mode */
+	if (!test_mode_enabled) {
+		ret = chown(ctdb->daemon.name, geteuid(), getegid());
+		if (ret != 0) {
+			D_ERR("Unable to secure (chown) ctdb socket '%s'\n",
+			      ctdb->daemon.name);
+			goto failed;
+		}
+	}
+
+	ret = chmod(ctdb->daemon.name, 0700);
+	if (ret != 0) {
+		D_ERR("Unable to secure (chmod) ctdb socket '%s'\n",
+		      ctdb->daemon.name);
+		goto failed;
+	}
+
+	ret = listen(ctdb->daemon.sd, 100);
+	if (ret != 0) {
+		D_ERR("Unable to listen on ctdb socket '%s'\n",
+		      ctdb->daemon.name);
+		goto failed;
+	}
+
+	D_NOTICE("Listening to ctdb socket %s\n", ctdb->daemon.name);
+	return 0;
+
+failed:
+	close(ctdb->daemon.sd);
+	ctdb->daemon.sd = -1;
+	return -1;
+}
+
+/*
+ * Look up a node by PNN in ctdb->nodes.
+ *
+ * CTDB_CURRENT_NODE is translated to this daemon's own PNN first.
+ * Returns the matching node, or NULL when no entry has that PNN.
+ */
+struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn)
+{
+	unsigned int idx = 0;
+
+	if (pnn == CTDB_CURRENT_NODE) {
+		pnn = ctdb->pnn;
+	}
+
+	while (idx < ctdb->num_nodes) {
+		struct ctdb_node *candidate = ctdb->nodes[idx];
+
+		if (candidate->pnn == pnn) {
+			return candidate;
+		}
+		idx++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Set the start-up flags of the local node: clear DISCONNECTED, then
+ * apply PERMANENTLY_DISABLED and/or STOPPED if the daemon was
+ * configured to start in those states.
+ */
+static void initialise_node_flags (struct ctdb_context *ctdb)
+{
+	struct ctdb_node *self = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
+
+	/*
+	 * The PNN is set just before this is called, so the lookup is
+	 * expected to succeed; the check keeps static analysers happy.
+	 */
+	if (self == NULL) {
+		DBG_ERR("Unable to find current node\n");
+		return;
+	}
+
+	self->flags &= ~NODE_FLAGS_DISCONNECTED;
+
+	if (ctdb->start_as_disabled != 0) {
+		/* Configured to start in DISABLED mode */
+		D_ERR("This node is configured to start in DISABLED state\n");
+		self->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
+	}
+
+	if (ctdb->start_as_stopped != 0) {
+		/* Configured to start in STOPPED mode */
+		D_ERR("This node is configured to start in STOPPED state\n");
+		self->flags |= NODE_FLAGS_STOPPED;
+	}
+}
+
+/*
+ * Completion callback for the "setup" event scheduled by
+ * ctdb_start_daemon().  A non-zero status is reported via ctdb_die().
+ * Afterwards the "setup" notification is run, the recovery daemon and
+ * the periodic events are started, and the daemon waits for the first
+ * recovery.  private_data is unused.
+ */
+static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
+				      void *private_data)
+{
+	int ret;
+
+	if (status != 0) {
+		ctdb_die(ctdb, "Failed to run setup event");
+	}
+	ctdb_run_notification_script(ctdb, "setup");
+
+	/* The recovery daemon is mandatory: exit if it cannot be started */
+	ret = ctdb_start_recoverd(ctdb);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
+		exit(11);
+	}
+
+	ctdb_start_periodic_events(ctdb);
+
+	ctdb_wait_for_first_recovery(ctdb);
+}
+
+/* Timestamps of the last BEFORE_WAIT / AFTER_WAIT tevent trace points */
+static struct timeval tevent_before_wait_ts;
+static struct timeval tevent_after_wait_ts;
+
+/*
+ * Seed both trace timestamps with the current time so that the first
+ * trace callback measures from daemon start-up.
+ */
+static void ctdb_tevent_trace_init(void)
+{
+	tevent_before_wait_ts = timeval_current();
+	tevent_after_wait_ts = tevent_before_wait_ts;
+}
+
+/*
+ * tevent trace callback used to spot a stalled event loop.
+ *
+ * Before each wait it logs if more than 3 seconds passed since the
+ * previous wait returned (slow event handling); after each wait it
+ * logs if the wait itself lasted more than 3 seconds (no events).
+ * Only acts in the main daemon process; other trace points are
+ * ignored.  private_data is the struct ctdb_context.
+ */
+static void ctdb_tevent_trace(enum tevent_trace_point tp,
+			      void *private_data)
+{
+	struct ctdb_context *ctdb =
+		talloc_get_type(private_data, struct ctdb_context);
+	struct timeval current;
+	struct timeval elapsed;
+
+	if (getpid() != ctdb->ctdbd_pid) {
+		return;
+	}
+
+	current = timeval_current();
+
+	if (tp == TEVENT_TRACE_BEFORE_WAIT) {
+		elapsed = timeval_until(&tevent_after_wait_ts, &current);
+		if (elapsed.tv_sec > 3) {
+			DEBUG(DEBUG_ERR,
+			      ("Handling event took %ld seconds!\n",
+			       (long)elapsed.tv_sec));
+		}
+		tevent_before_wait_ts = current;
+	} else if (tp == TEVENT_TRACE_AFTER_WAIT) {
+		elapsed = timeval_until(&tevent_before_wait_ts, &current);
+		if (elapsed.tv_sec > 3) {
+			DEBUG(DEBUG_ERR,
+			      ("No event for %ld seconds!\n",
+			       (long)elapsed.tv_sec));
+		}
+		tevent_after_wait_ts = current;
+	}
+	/* Any other (future) tevent trace point is deliberately ignored */
+}
+
+/* atexit() handler: drop the PID file context created at start-up */
+static void ctdb_remove_pidfile(void)
+{
+	TALLOC_FREE(ctdbd_pidfile_ctx);
+}
+
+/*
+ * Create the daemon PID file, if one is configured in ctdbd_pidfile,
+ * and register ctdb_remove_pidfile() to run at exit.  Exits with
+ * status 11 when the PID file cannot be created.
+ */
+static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx)
+{
+	int ret;
+
+	if (ctdbd_pidfile == NULL) {
+		/* No PID file requested */
+		return;
+	}
+
+	ret = pidfile_context_create(mem_ctx, ctdbd_pidfile,
+				     &ctdbd_pidfile_ctx);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to create PID file %s\n",
+		       ctdbd_pidfile));
+		exit(11);
+	}
+
+	DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
+	atexit(ctdb_remove_pidfile);
+}
+
+/*
+ * Build the initial VNN map in ctdb->vnn_map.
+ *
+ * The map gets one slot per node that is not flagged
+ * NODE_FLAGS_DELETED; each slot holds the index of that node in
+ * ctdb->nodes.  The generation starts out as INVALID_GENERATION.
+ * Allocation failures are handled by CTDB_NO_MEMORY_FATAL.
+ */
+static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb)
+{
+	unsigned int i, j, count;
+
+	/* initialize the vnn mapping table, skipping any deleted nodes */
+	ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map);
+
+	/* First pass: count the nodes that are not deleted */
+	count = 0;
+	for (i = 0; i < ctdb->num_nodes; i++) {
+		if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) {
+			count++;
+		}
+	}
+
+	ctdb->vnn_map->generation = INVALID_GENERATION;
+	ctdb->vnn_map->size = count;
+	ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map);
+
+	/*
+	 * Second pass: this must walk *all* nodes, not just the first
+	 * "count" of them.  Bounding the loop by the map size would stop
+	 * early whenever a deleted node precedes a live one, leaving the
+	 * tail of the map uninitialised.  j stays below count because
+	 * both passes use the same predicate.
+	 */
+	for (i = 0, j = 0; i < ctdb->num_nodes; i++) {
+		if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+			continue;
+		}
+		ctdb->vnn_map->map[j] = i;
+		j++;
+	}
+}
+
+/*
+ * Work out this node's PNN from its address (ctdb->address) and store
+ * it in ctdb->pnn.  A missing address or an address that maps to
+ * CTDB_UNKNOWN_PNN is reported through ctdb_fatal().
+ */
+static void ctdb_set_my_pnn(struct ctdb_context *ctdb)
+{
+	if (ctdb->address == NULL) {
+		ctdb_fatal(ctdb,
+			   "Can not determine PNN - node address is not set\n");
+	}
+
+	ctdb->pnn = ctdb_ip_to_pnn(ctdb, ctdb->address);
+
+	if (ctdb->pnn == CTDB_UNKNOWN_PNN) {
+		ctdb_fatal(ctdb,
+			   "Can not determine PNN - unknown node address\n");
+	}
+
+	D_NOTICE("PNN is %u\n", ctdb->pnn);
+}
+
+/*
+ * tevent fd handler for stdin.  Reads a single byte; if that read
+ * does not return exactly one byte (EOF or error) the fd event is
+ * freed and the daemon shutdown sequence is started with EPIPE.
+ * private_data is the struct ctdb_context.
+ */
+static void stdin_handler(struct tevent_context *ev,
+			  struct tevent_fd *fde,
+			  uint16_t flags,
+			  void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type_abort(
+		private_data, struct ctdb_context);
+	char byte;
+
+	if (read(STDIN_FILENO, &byte, 1) == 1) {
+		/* Input is simply discarded */
+		return;
+	}
+
+	D_ERR("stdin closed, exiting\n");
+	talloc_free(fde);
+	ctdb_shutdown_sequence(ctdb, EPIPE);
+}
+
+/*
+ * Watch stdin for readability when it is a pipe (FIFO), so that
+ * stdin_handler() can notice the other end going away.
+ *
+ * Returns 0 on success and also when stdin cannot be examined or is
+ * not a pipe (nothing is installed then); ENOMEM if the fd event
+ * cannot be created.
+ */
+static int setup_stdin_handler(struct ctdb_context *ctdb)
+{
+	struct stat sb;
+	struct tevent_fd *stdin_fde;
+
+	if (fstat(STDIN_FILENO, &sb) != 0) {
+		/* Problem with stdin, ignore... */
+		DBG_INFO("Can't fstat() stdin\n");
+		return 0;
+	}
+
+	if (!S_ISFIFO(sb.st_mode)) {
+		DBG_INFO("Not a pipe...\n");
+		return 0;
+	}
+
+	stdin_fde = tevent_add_fd(ctdb->ev,
+				  ctdb,
+				  STDIN_FILENO,
+				  TEVENT_FD_READ,
+				  stdin_handler,
+				  ctdb);
+	if (stdin_fde == NULL) {
+		return ENOMEM;
+	}
+
+	DBG_INFO("Set up stdin handler\n");
+	return 0;
+}
+
+/*
+ * Fork once and continue in the child; the parent exits with status
+ * 0.  Nothing else is changed in the child.  Exits with status 1 if
+ * fork() fails.
+ */
+static void fork_only(void)
+{
+	pid_t child = fork();
+
+	if (child == -1) {
+		D_ERR("Fork failed (errno=%d)\n", errno);
+		exit(1);
+	}
+
+	if (child == 0) {
+		/* Child carries on as the daemon */
+		return;
+	}
+
+	/* Parent simply exits... */
+	exit(0);
+}
+
+/*
+ * Hook run on SIGHUP (registered through
+ * logging_setup_sighup_handler()): forwards SIGHUP to the recovery
+ * daemon, if its PID is known, and calls ctdb_event_reopen_logs().
+ */
+static void sighup_hook(void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type_abort(private_data,
+							  struct ctdb_context);
+	pid_t recoverd = ctdb->recoverd_pid;
+
+	if (recoverd > 0) {
+		kill(recoverd, SIGHUP);
+	}
+	ctdb_event_reopen_logs(ctdb);
+}
+
+/*
+ start the protocol going as a daemon
+
+ Unless "interactive" is set the process first forks: with
+ fork_only() when test_mode_enabled is set (stdin stays open),
+ with become_daemon() otherwise.  It then records the daemon PID,
+ creates the PID file and the listening unix socket, sets up the
+ event context, the SIGHUP/SIGCHLD handlers, the srvid and tunnel
+ contexts and the event daemon, runs the "init" event, initialises
+ the transport, determines the PNN and node flags, sets up public
+ addresses and the VNN map, attaches and freezes the databases,
+ starts accepting clients, starts the transport, schedules the
+ "setup" event and finally enters the event loop.
+
+ The only "return" is -1 when transport initialisation fails.  All
+ other failures go through exit(), ctdb_die() or ctdb_fatal(), and
+ the code after the event loop calls exit(1).
+*/
+int ctdb_start_daemon(struct ctdb_context *ctdb,
+ bool interactive,
+ bool test_mode_enabled)
+{
+ bool status;
+ int ret;
+ struct tevent_fd *fde;
+
+ /* Fork if not interactive */
+ if (!interactive) {
+ if (test_mode_enabled) {
+ /* Keep stdin open */
+ fork_only();
+ } else {
+ /* Fork, close stdin, start a session */
+ become_daemon(true, false, false);
+ }
+ }
+
+ ignore_signal(SIGPIPE);
+ ignore_signal(SIGUSR1);
+
+ ctdb->ctdbd_pid = getpid();
+ DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
+ SAMBA_VERSION_STRING, ctdb->ctdbd_pid));
+ ctdb_create_pidfile(ctdb);
+
+ /* create a unix domain stream socket to listen to */
+ ret = ux_socket_bind(ctdb, test_mode_enabled);
+ if (ret != 0) {
+ D_ERR("Cannot continue. Exiting!\n");
+ exit(10);
+ }
+
+ /* Make sure we log something when the daemon terminates.
+ * This must be the first exit handler to run (so the last to
+ * be registered).
+ */
+ __ctdbd_pid = getpid();
+ atexit(print_exit_message);
+
+ if (ctdb->do_setsched) {
+ /* try to set us up as realtime */
+ if (!set_scheduler()) {
+ exit(1);
+ }
+ DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n"));
+ }
+
+ ctdb->ev = tevent_context_init(NULL);
+ if (ctdb->ev == NULL) {
+ DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
+ exit(1);
+ }
+ tevent_loop_allow_nesting(ctdb->ev);
+ ctdb_tevent_trace_init();
+ tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);
+
+ status = logging_setup_sighup_handler(ctdb->ev,
+ ctdb,
+ sighup_hook,
+ ctdb);
+ if (!status) {
+ D_ERR("Failed to set up signal handler for SIGHUP\n");
+ exit(1);
+ }
+
+ /* set up a handler to pick up sigchld */
+ if (ctdb_init_sigchld(ctdb) == NULL) {
+ DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
+ exit(1);
+ }
+
+ if (!interactive) {
+ ctdb_set_child_logging(ctdb);
+ }
+
+ /* Exit if stdin is closed */
+ if (test_mode_enabled) {
+ ret = setup_stdin_handler(ctdb);
+ if (ret != 0) {
+ DBG_ERR("Failed to setup stdin handler\n");
+ exit(1);
+ }
+ }
+
+ TALLOC_FREE(ctdb->srv);
+ if (srvid_init(ctdb, &ctdb->srv) != 0) {
+ DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n"));
+ exit(1);
+ }
+
+ TALLOC_FREE(ctdb->tunnels);
+ if (srvid_init(ctdb, &ctdb->tunnels) != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n"));
+ exit(1);
+ }
+
+ /* initialize statistics collection */
+ ctdb_statistics_init(ctdb);
+
+ /* force initial recovery for election */
+ ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+ if (ctdb_start_eventd(ctdb) != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to start event daemon\n"));
+ exit(1);
+ }
+
+ ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
+ ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
+ if (ret != 0) {
+ ctdb_die(ctdb, "Failed to run init event\n");
+ }
+ ctdb_run_notification_script(ctdb, "init");
+
+ if (strcmp(ctdb->transport, "tcp") == 0) {
+ ret = ctdb_tcp_init(ctdb);
+ }
+#ifdef USE_INFINIBAND
+ if (strcmp(ctdb->transport, "ib") == 0) {
+ ret = ctdb_ibw_init(ctdb);
+ }
+#endif
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
+ return -1;
+ }
+
+ if (ctdb->methods == NULL) {
+ DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
+ ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
+ }
+
+ /* Initialise the transport. This sets the node address if it
+ * was not set via the command-line. */
+ if (ctdb->methods->initialise(ctdb) != 0) {
+ ctdb_fatal(ctdb, "transport failed to initialise");
+ }
+
+ ctdb_set_my_pnn(ctdb);
+
+ initialise_node_flags(ctdb);
+
+ ret = ctdb_set_public_addresses(ctdb, true);
+ if (ret == -1) {
+ D_ERR("Unable to setup public IP addresses\n");
+ exit(1);
+ }
+
+ ctdb_initialise_vnn_map(ctdb);
+
+ /* attach to existing databases */
+ if (ctdb_attach_databases(ctdb) != 0) {
+ ctdb_fatal(ctdb, "Failed to attach to databases\n");
+ }
+
+ /* start frozen, then let the first election sort things out */
+ if (!ctdb_blocking_freeze(ctdb)) {
+ ctdb_fatal(ctdb, "Failed to get initial freeze\n");
+ }
+
+ /* now start accepting clients, only can do this once frozen;
+ * the listening socket is closed together with the fd event */
+ fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ,
+ ctdb_accept_client, ctdb);
+ if (fde == NULL) {
+ ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
+ }
+ tevent_fd_set_auto_close(fde);
+
+ /* Start the transport */
+ if (ctdb->methods->start(ctdb) != 0) {
+ DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
+ ctdb_fatal(ctdb, "transport failed to start");
+ }
+
+ /* Recovery daemon and timed events are started from the
+ * callback, only after the setup event completes
+ * successfully.
+ */
+ ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
+ ret = ctdb_event_script_callback(ctdb,
+ ctdb,
+ ctdb_setup_event_callback,
+ ctdb,
+ CTDB_EVENT_SETUP,
+ "%s",
+ "");
+ if (ret != 0) {
+ DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
+ exit(1);
+ }
+
+ lockdown_memory(ctdb->valgrinding);
+
+ /* go into a wait loop to allow other nodes to complete */
+ tevent_loop_wait(ctdb->ev);
+
+ DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
+ exit(1);
+}
+
+/*
+ * Allocate a packet for use in daemon<->daemon communication.
+ *
+ * The packet length is the larger of "length" and "slength"; the
+ * allocation is rounded up to a multiple of CTDB_DS_ALIGNMENT.  The
+ * first "slength" bytes are zeroed and the common request header is
+ * filled in (length, operation, magic, protocol version, current
+ * generation and source node).  "type" becomes the talloc name of
+ * the packet.
+ *
+ * Returns NULL when the transport is down (ctdb->methods == NULL) or
+ * the transport's allocate_pkt() fails.
+ */
+struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
+						 TALLOC_CTX *mem_ctx,
+						 enum ctdb_operation operation,
+						 size_t length, size_t slength,
+						 const char *type)
+{
+	struct ctdb_req_header *pkt;
+	int aligned_size;
+
+	if (length < slength) {
+		length = slength;
+	}
+	aligned_size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
+			 operation, (unsigned)length));
+		return NULL;
+	}
+
+	pkt = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, aligned_size);
+	if (pkt == NULL) {
+		DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
+			 operation, (unsigned)length));
+		return NULL;
+	}
+
+	talloc_set_name_const(pkt, type);
+	memset(pkt, 0, slength);
+
+	/* Common header shared by all inter-daemon packets */
+	pkt->length       = length;
+	pkt->operation    = operation;
+	pkt->ctdb_magic   = CTDB_MAGIC;
+	pkt->ctdb_version = CTDB_PROTOCOL;
+	pkt->generation   = ctdb->vnn_map->generation;
+	pkt->srcnode      = ctdb->pnn;
+
+	return pkt;
+}
+
+struct daemon_control_state { /* one control sent on behalf of a local client */
+ struct daemon_control_state *next, *prev; /* links in node->pending_controls */
+ struct ctdb_client *client; /* client that issued the control; the reply is queued to it */
+ struct ctdb_req_control_old *c; /* the request packet, talloc_steal()ed onto this state */
+ uint32_t reqid; /* reqid from the request header, copied into the reply */
+ struct ctdb_node *node; /* destination node, or NULL if destnode is not a valid PNN */
+};
+
+/*
+ * Callback when a control reply comes in.
+ *
+ * Builds a CTDB_REPLY_CONTROL packet carrying "status", the reply
+ * data and, if given, the error message (appended to the data without
+ * a terminating NUL) and queues it to the client that issued the
+ * control.  private_data is the struct daemon_control_state; it is
+ * freed here unless daemon_queue_send() returns -1.
+ */
+static void daemon_control_callback(struct ctdb_context *ctdb,
+				    int32_t status, TDB_DATA data,
+				    const char *errormsg,
+				    void *private_data)
+{
+	struct daemon_control_state *state = talloc_get_type(private_data,
+							     struct daemon_control_state);
+	struct ctdb_client *client = state->client;
+	struct ctdb_reply_control_old *r;
+	size_t errlen = 0;
+	size_t len;
+	int ret;
+
+	if (errormsg != NULL) {
+		errlen = strlen(errormsg);
+	}
+
+	/* construct a message to send to the client containing the data */
+	len = offsetof(struct ctdb_reply_control_old, data) + data.dsize + errlen;
+	r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
+			       struct ctdb_reply_control_old);
+	CTDB_NO_MEMORY_VOID(ctdb, r);
+
+	r->hdr.reqid     = state->reqid;
+	r->status        = status;
+	r->datalen       = data.dsize;
+	r->errorlen      = 0;
+
+	/*
+	 * Callers may pass tdb_null (see ctdb_daemon_cancel_controls());
+	 * memcpy() must not be handed a NULL pointer, even for 0 bytes.
+	 */
+	if (data.dsize > 0) {
+		memcpy(&r->data[0], data.dptr, data.dsize);
+	}
+	if (errlen > 0) {
+		r->errorlen = errlen;
+		memcpy(&r->data[r->datalen], errormsg, errlen);
+	}
+
+	ret = daemon_queue_send(client, &r->hdr);
+	if (ret != -1) {
+		talloc_free(state);
+	}
+}
+
+/*
+ * Fail all pending controls to a disconnected node.
+ *
+ * Each entry is unlinked from node->pending_controls and then
+ * completed through daemon_control_callback() with status -1, no
+ * data and the error text "node is disconnected".
+ */
+void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
+{
+	for (;;) {
+		struct daemon_control_state *pending = node->pending_controls;
+
+		if (pending == NULL) {
+			break;
+		}
+
+		DLIST_REMOVE(node->pending_controls, pending);
+		daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
+					"node is disconnected", pending);
+	}
+}
+
+/*
+ * talloc destructor for a daemon_control_state: take the state off
+ * its destination node's pending_controls list, if it has a node.
+ * Always returns 0.
+ */
+static int daemon_control_destructor(struct daemon_control_state *state)
+{
+	struct ctdb_node *dest = state->node;
+
+	if (dest == NULL) {
+		return 0;
+	}
+
+	DLIST_REMOVE(dest->pending_controls, state);
+	return 0;
+}
+
+/*
+ * This is called when the ctdb daemon received a ctdb request control
+ * from a local client over the unix domain socket.
+ *
+ * A daemon_control_state is allocated under the client, takes
+ * ownership of the request packet and, if the destination is a valid
+ * PNN, is linked into that node's pending_controls list.  The control
+ * is then handed to ctdb_daemon_send_control() with
+ * daemon_control_callback() as the completion callback.
+ *
+ * For CTDB_CTRL_FLAG_NOREPLY controls the state is moved onto a
+ * temporary context so that it is freed before returning.
+ */
+static void daemon_request_control_from_client(struct ctdb_client *client,
+					       struct ctdb_req_control_old *c)
+{
+	TDB_DATA data;
+	int res;
+	struct daemon_control_state *state;
+	TALLOC_CTX *tmp_ctx = talloc_new(client);
+
+	/*
+	 * Without tmp_ctx a NOREPLY state would be stolen onto the NULL
+	 * context and never freed, so bail out early.
+	 */
+	CTDB_NO_MEMORY_VOID(client->ctdb, tmp_ctx);
+
+	if (c->hdr.destnode == CTDB_CURRENT_NODE) {
+		c->hdr.destnode = client->ctdb->pnn;
+	}
+
+	state = talloc(client, struct daemon_control_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory allocating control state\n"));
+		/* Don't leave tmp_ctx hanging off the client */
+		talloc_free(tmp_ctx);
+		return;
+	}
+
+	state->client = client;
+	state->c = talloc_steal(state, c);
+	state->reqid = c->hdr.reqid;
+	if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
+		state->node = client->ctdb->nodes[c->hdr.destnode];
+		DLIST_ADD(state->node->pending_controls, state);
+	} else {
+		state->node = NULL;
+	}
+
+	talloc_set_destructor(state, daemon_control_destructor);
+
+	if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
+		/* No reply wanted: state is freed along with tmp_ctx below */
+		talloc_steal(tmp_ctx, state);
+	}
+
+	data.dptr = &c->data[0];
+	data.dsize = c->datalen;
+	res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
+				       c->srvid, c->opcode, client->client_id,
+				       c->flags,
+				       data, daemon_control_callback,
+				       state);
+	if (res != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
+			 c->hdr.destnode));
+	}
+
+	talloc_free(tmp_ctx);
+}
+
+/*
+ * Handle a tunnel request received from a local client.
+ *
+ * The packet is dropped (with an error logged) if the destination is
+ * not a valid PNN or the tunnel ID is not registered in
+ * ctdb->tunnels.  Otherwise the payload is passed on with
+ * ctdb_daemon_send_tunnel().
+ */
+static void daemon_request_tunnel_from_client(struct ctdb_client *client,
+					      struct ctdb_req_tunnel_old *c)
+{
+	struct ctdb_context *ctdb = client->ctdb;
+	TDB_DATA payload;
+	int ret;
+
+	if (! ctdb_validate_pnn(ctdb, c->hdr.destnode)) {
+		DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n",
+				  c->hdr.destnode));
+		return;
+	}
+
+	if (srvid_exists(ctdb->tunnels, c->tunnel_id, NULL) != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n",
+		       c->tunnel_id));
+		return;
+	}
+
+	payload.dsize = c->datalen;
+	payload.dptr = &c->data[0];
+
+	ret = ctdb_daemon_send_tunnel(ctdb, c->hdr.destnode,
+				      c->tunnel_id, c->flags, payload);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to set tunnel to remote note %u\n",
+				  c->hdr.destnode));
+	}
+}
+
+/*
+ * Register a call function.
+ *
+ * Adds "fn" under call ID "id" to the calls list of the database
+ * identified by db_id.  Returns 0 on success, -1 if the database is
+ * not known or memory cannot be allocated.
+ */
+int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
+			 ctdb_fn_t fn, int id)
+{
+	struct ctdb_registered_call *call;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (ctdb_db == NULL) {
+		return -1;
+	}
+
+	call = talloc(ctdb_db, struct ctdb_registered_call);
+	/* Logs and returns -1 on allocation failure instead of crashing */
+	CTDB_NO_MEMORY(ctdb, call);
+
+	call->fn = fn;
+	call->id = id;
+
+	DLIST_ADD(ctdb_db->calls, call);
+	return 0;
+}
+
+
+
+/*
+ this local messaging handler is ugly, but is needed to prevent
+ recursion in ctdb_send_message() when the destination node is the
+ same as the source node
+
+ One of these is allocated per message sent to ourselves; it is
+ dispatched from a zero-timeout timer event and freed afterwards.
+ */
+struct ctdb_local_message {
+ struct ctdb_context *ctdb; /* daemon context whose srvid handlers get the message */
+ uint64_t srvid; /* service ID the message is dispatched to */
+ TDB_DATA data; /* private copy of the payload, owned by this struct */
+};
+
+/*
+ * Timer callback that delivers a queued local message: dispatch the
+ * payload to the handlers registered for its srvid (and for
+ * CTDB_SRVID_ALL), then free the message.
+ */
+static void ctdb_local_message_trigger(struct tevent_context *ev,
+				       struct tevent_timer *te,
+				       struct timeval t, void *private_data)
+{
+	struct ctdb_local_message *msg = talloc_get_type(
+		private_data, struct ctdb_local_message);
+	struct ctdb_context *ctdb = msg->ctdb;
+
+	srvid_dispatch(ctdb->srv, msg->srvid, CTDB_SRVID_ALL, msg->data);
+	talloc_free(msg);
+}
+
+/*
+ * Queue a message addressed to this node for local delivery.
+ *
+ * The payload is copied and delivered later from a zero-timeout timer
+ * event (ctdb_local_message_trigger()) rather than directly, to
+ * prevent recursion.  Returns 0 once the delivery is scheduled, -1 if
+ * memory or the timer event cannot be allocated.
+ */
+static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
+{
+	struct ctdb_local_message *m;
+	struct tevent_timer *te;
+
+	m = talloc(ctdb, struct ctdb_local_message);
+	CTDB_NO_MEMORY(ctdb, m);
+
+	m->ctdb = ctdb;
+	m->srvid = srvid;
+	m->data  = data;
+	/* Take a private copy: the caller's buffer may not outlive us */
+	m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
+	if (m->data.dptr == NULL) {
+		talloc_free(m);
+		return -1;
+	}
+
+	/* this needs to be done as an event to prevent recursion */
+	te = tevent_add_timer(ctdb->ev, m, timeval_zero(),
+			      ctdb_local_message_trigger, m);
+	if (te == NULL) {
+		/* Nothing will ever deliver or free m, so fail here */
+		DEBUG(DEBUG_ERR,(__location__ " Failed to schedule local message\n"));
+		talloc_free(m);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Send a ctdb message to node "pnn" for service "srvid".
+ *
+ * Messages to our own PNN are handed to ctdb_local_message().  For
+ * any other destination a CTDB_REQ_MESSAGE packet is built, queued
+ * with ctdb_queue_packet() and then freed.  Returns -1 when the
+ * transport is down or the packet cannot be allocated, otherwise 0
+ * (or the result of ctdb_local_message()).
+ */
+int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+			     uint64_t srvid, TDB_DATA data)
+{
+	struct ctdb_req_message_old *msg;
+	int pkt_len;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
+		return -1;
+	}
+
+	if (pnn == ctdb->pnn) {
+		/* Addressed to ourselves: no packet needed */
+		return ctdb_local_message(ctdb, srvid, data);
+	}
+
+	pkt_len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
+	msg = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, pkt_len,
+				      struct ctdb_req_message_old);
+	CTDB_NO_MEMORY(ctdb, msg);
+
+	msg->hdr.destnode = pnn;
+	msg->srvid = srvid;
+	msg->datalen = data.dsize;
+	memcpy(&msg->data[0], data.dptr, data.dsize);
+
+	ctdb_queue_packet(ctdb, &msg->hdr);
+	talloc_free(msg);
+
+	return 0;
+}
+
+
+
+struct ctdb_client_notify_list { /* a notification registered by a client */
+ struct ctdb_client_notify_list *next, *prev; /* links in client->notify */
+ struct ctdb_context *ctdb; /* daemon context used to send the message */
+ uint64_t srvid; /* service ID the notification is sent to */
+ TDB_DATA data; /* payload sent when this entry's destructor runs */
+};
+
+
+/*
+ * talloc destructor for a registered client notification: send the
+ * stored payload to CTDB_BROADCAST_CONNECTED under the registered
+ * srvid.  A send failure is only logged; the destructor always
+ * returns 0.
+ */
+static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
+{
+	DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
+
+	if (ctdb_daemon_send_message(nl->ctdb,
+				     CTDB_BROADCAST_CONNECTED,
+				     nl->srvid,
+				     nl->data) != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
+	}
+
+	return 0;
+}
+
+/*
+ * Control handler: register a notification for a local client.
+ *
+ * indata holds a struct ctdb_notify_data_old (srvid, len and "len"
+ * bytes of payload).  A copy of the payload is attached to the client
+ * and is sent when that list entry is destroyed (see
+ * ctdb_client_notify_destructor()).
+ *
+ * Returns 0 on success.  Returns -1 if indata has the wrong size, the
+ * client_id does not identify a local client, the srvid is already
+ * registered for this client, or memory cannot be allocated.
+ */
+int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+	struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr;
+	struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	struct ctdb_client_notify_list *nl;
+	const size_t hdr_len = offsetof(struct ctdb_notify_data_old, notify_data);
+
+	/* Validate the size before touching any field of *notify */
+	if (indata.dptr == NULL || indata.dsize < hdr_len) {
+		DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
+		return -1;
+	}
+
+	if (indata.dsize != (notify->len + hdr_len)) {
+		DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + hdr_len)));
+		return -1;
+	}
+
+	DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
+
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+		return -1;
+	}
+
+	/* Only one notification per srvid and client */
+	for(nl=client->notify; nl; nl=nl->next) {
+		if (nl->srvid == notify->srvid) {
+			break;
+		}
+	}
+	if (nl != NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
+		return -1;
+	}
+
+	nl = talloc(client, struct ctdb_client_notify_list);
+	CTDB_NO_MEMORY(ctdb, nl);
+	nl->ctdb       = ctdb;
+	nl->srvid      = notify->srvid;
+	nl->data.dsize = notify->len;
+	nl->data.dptr  = talloc_memdup(nl, notify->notify_data,
+				       nl->data.dsize);
+	if (nl->data.dptr == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory copying notify data\n"));
+		/* Not linked yet and no destructor set: just drop it */
+		talloc_free(nl);
+		return -1;
+	}
+
+	DLIST_ADD(client->notify, nl);
+	talloc_set_destructor(nl, ctdb_client_notify_destructor);
+
+	return 0;
+}
+
+/*
+ * Control handler: remove a notification previously registered by a
+ * local client.
+ *
+ * indata carries the 64-bit srvid.  The matching entry is unlinked
+ * and freed with its destructor disabled, so no notification message
+ * is sent.
+ *
+ * Returns 0 on success.  Returns -1 if indata is too small, the
+ * client_id does not identify a local client, or no notification is
+ * registered for that srvid.
+ */
+int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+	uint64_t srvid;
+	struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	struct ctdb_client_notify_list *nl;
+
+	if (indata.dptr == NULL || indata.dsize < sizeof(srvid)) {
+		DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
+		return -1;
+	}
+	/* Copy out rather than cast: the buffer need not be aligned */
+	memcpy(&srvid, indata.dptr, sizeof(srvid));
+
+	DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id));
+
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+		return -1;
+	}
+
+	for(nl=client->notify; nl; nl=nl->next) {
+		if (nl->srvid == srvid) {
+			break;
+		}
+	}
+	if (nl == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid));
+		return -1;
+	}
+
+	DLIST_REMOVE(client->notify, nl);
+	/* Clear the destructor so that freeing does not send the message */
+	talloc_set_destructor(nl, NULL);
+	talloc_free(nl);
+
+	return 0;
+}
+
+/*
+ * Return the client of the first entry in ctdb->client_pids whose
+ * recorded PID equals "pid", or NULL if there is none.
+ */
+struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
+{
+	struct ctdb_client_pid_list *entry = ctdb->client_pids;
+
+	while (entry != NULL) {
+		if (entry->pid == pid) {
+			return entry->client;
+		}
+		entry = entry->next;
+	}
+
+	return NULL;
+}
+
+
+/* This control is used by samba when probing if a process (of a samba
+   daemon) exists on the node.
+   Samba does this when it needs/wants to check if a subrecord in one of
+   the databases is still valid, or if it is stale and can be removed.
+   If the node is in an inactive (banned/stopped) state we drop the
+   client connection of the samba process holding this sub-record and
+   tell the calling samba that the process does not exist.
+   This allows us to forcefully recall subrecords registered by samba
+   processes on banned and stopped nodes.
+
+   Returns -1 if no connected client has this pid or the node is
+   inactive, otherwise the result of kill(pid, 0).
+*/
+int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
+{
+	struct ctdb_client *client = ctdb_find_client_by_pid(ctdb, pid);
+
+	if (client == NULL) {
+		return -1;
+	}
+
+	if ((ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) == 0) {
+		/* Active node: just probe the process */
+		return kill(pid, 0);
+	}
+
+	DEBUG(DEBUG_NOTICE,
+	      ("Killing client with pid:%d on banned/stopped node\n",
+	       (int)pid));
+	talloc_free(client);
+	return -1;
+}
+
+/*
+ * Control handler: check whether a connected client with the given
+ * PID has registered the given srvid.
+ *
+ * indata holds a pid_t immediately followed by a uint64_t srvid.
+ * Returns 0 if srvid_exists() succeeds for any client entry with that
+ * PID.  Returns -1 otherwise, including when indata is too small.
+ */
+int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb,
+				     TDB_DATA indata)
+{
+	struct ctdb_client_pid_list *client_pid;
+	pid_t pid;
+	uint64_t srvid;
+	int ret;
+
+	if (indata.dptr == NULL ||
+	    indata.dsize < sizeof(pid) + sizeof(srvid)) {
+		DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
+		return -1;
+	}
+
+	/*
+	 * Copy the fields out: the srvid sits at offset sizeof(pid_t),
+	 * which is not necessarily aligned for a uint64_t.
+	 */
+	memcpy(&pid, indata.dptr, sizeof(pid));
+	memcpy(&srvid, indata.dptr + sizeof(pid), sizeof(srvid));
+
+	for (client_pid = ctdb->client_pids;
+	     client_pid != NULL;
+	     client_pid = client_pid->next) {
+		if (client_pid->pid == pid) {
+			ret = srvid_exists(ctdb->srv, srvid,
+					   client_pid->client);
+			if (ret == 0) {
+				return 0;
+			}
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Control handler: return the node map read from the nodes file
+ * (ctdb->nodes_file).  The control takes no input data.  On success
+ * outdata points at the node map and its talloc size, and 0 is
+ * returned; -1 is returned if the file cannot be read.
+ */
+int ctdb_control_getnodesfile(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+	struct ctdb_node_map_old *nodes;
+
+	CHECK_CONTROL_DATA_SIZE(0);
+
+	nodes = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
+	if (nodes == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to read nodes file\n"));
+		return -1;
+	}
+
+	*outdata = (TDB_DATA) {
+		.dptr = (unsigned char *)nodes,
+		.dsize = talloc_get_size(nodes),
+	};
+
+	return 0;
+}
+
+/*
+ * Shut the daemon down in order and exit with exit_code.
+ *
+ * If the runstate is already CTDB_RUNSTATE_SHUTDOWN this is a no-op
+ * that returns to the caller.  Otherwise the runstate is switched,
+ * the recovery daemon, keepalives and monitoring are stopped, the
+ * "shutdown" event is run, the event daemon is stopped and the
+ * transport (if it provides a shutdown method) is shut down before
+ * exit() is called.
+ */
+void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
+{
+	bool already_stopping = (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN);
+
+	if (already_stopping) {
+		DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n"));
+	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
+
+	/* Stop the helpers and periodic activity first */
+	ctdb_stop_recoverd(ctdb);
+	ctdb_stop_keepalive(ctdb);
+	ctdb_stop_monitoring(ctdb);
+
+	/* The shutdown event needs the event daemon, so stop that last */
+	ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+	ctdb_stop_eventd(ctdb);
+
+	if (ctdb->methods != NULL) {
+		if (ctdb->methods->shutdown != NULL) {
+			ctdb->methods->shutdown(ctdb);
+		}
+	}
+
+	DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n"));
+	exit(exit_code);
+}
+
+/* When forking the main daemon and the child process needs to connect
+ * back to the daemon as a client process, this function can be used
+ * to change the ctdb context from daemon into client mode.  The child
+ * process must be created using ctdb_fork() and not fork() -
+ * ctdb_fork() does some necessary housekeeping.
+ *
+ * Closes the inherited listening socket (if open), creates a fresh
+ * event context, connects to the main daemon and enables sending of
+ * controls.  Returns 0 on success and -1 if the connection fails;
+ * exits if the event context cannot be created.
+ */
+int switch_from_server_to_client(struct ctdb_context *ctdb)
+{
+	if (ctdb->daemon.sd != -1) {
+		close(ctdb->daemon.sd);
+		ctdb->daemon.sd = -1;
+	}
+
+	/* get a new event context */
+	ctdb->ev = tevent_context_init(ctdb);
+	if (ctdb->ev == NULL) {
+		DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
+		exit(1);
+	}
+	tevent_loop_allow_nesting(ctdb->ev);
+
+	/* Connect to main CTDB daemon */
+	if (ctdb_socket_connect(ctdb) != 0) {
+		DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
+		return -1;
+	}
+
+	ctdb->can_send_controls = true;
+	return 0;
+}
diff --git a/ctdb/server/ctdb_fork.c b/ctdb/server/ctdb_fork.c
new file mode 100644
index 0000000..1065423
--- /dev/null
+++ b/ctdb/server/ctdb_fork.c
@@ -0,0 +1,216 @@
+/*
+ functions to track and manage processes
+
+ Copyright (C) Ronnie Sahlberg 2012
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/wait.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ * Record a child PID in ctdb->child_processes, keyed by PID, with a
+ * "process:<pid>" string as the stored value.  Does nothing unless
+ * called in the main daemon process.
+ */
+void ctdb_track_child(struct ctdb_context *ctdb, pid_t pid)
+{
+	/* Only CTDB main daemon should track child processes */
+	if (getpid() == ctdb->ctdbd_pid) {
+		char *label = talloc_asprintf(ctdb->child_processes,
+					      "process:%d", (int)pid);
+
+		trbt_insert32(ctdb->child_processes, pid, label);
+	}
+}
+
+/*
+ * This function forks a child process and drops the realtime
+ * scheduler for the child process.
+ *
+ * In the child the event context is freed, the unix domain socket is
+ * closed, the transport is shut down and can_send_controls is
+ * cleared; 0 is returned.  In the parent a warning is logged if the
+ * fork took more than 3 seconds, the child is tracked with
+ * ctdb_track_child() and its PID is returned.  Returns -1 if fork()
+ * fails.
+ */
+pid_t ctdb_fork(struct ctdb_context *ctdb)
+{
+	struct timeval start = timeval_current();
+	double elapsed;
+	pid_t child;
+
+	child = fork();
+
+	switch (child) {
+	case -1:
+		DEBUG(DEBUG_ERR,
+		      (__location__ " fork() failed (%s)\n", strerror(errno)));
+		return -1;
+
+	case 0:
+		/* Close the Unix Domain socket and the TCP socket.
+		 * This ensures that none of the child processes will
+		 * look like the main daemon when it is not running.
+		 * tevent needs to be stopped before closing sockets.
+		 */
+		if (ctdb->ev != NULL) {
+			talloc_free(ctdb->ev);
+			ctdb->ev = NULL;
+		}
+		if (ctdb->daemon.sd != -1) {
+			close(ctdb->daemon.sd);
+			ctdb->daemon.sd = -1;
+		}
+		if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
+			ctdb->methods->shutdown(ctdb);
+		}
+
+		/* The child does not need to be realtime */
+		if (ctdb->do_setsched) {
+			reset_scheduler();
+		}
+		ctdb->can_send_controls = false;
+
+		return 0;
+
+	default:
+		break;
+	}
+
+	/* Parent: note slow forks and remember the child */
+	elapsed = timeval_elapsed(&start);
+	if (elapsed > 3.0) {
+		DEBUG(DEBUG_WARNING, ("fork() took %lf seconds\n", elapsed));
+	}
+
+	ctdb_track_child(ctdb, child);
+	return child;
+}
+
+/*
+ * vfork + exec
+ *
+ * Runs "helper" with argv[0] = helper followed by the helper_argc
+ * entries of helper_argv.  The argument vector is allocated on
+ * mem_ctx and is always NULL-terminated here, so execv() is safe
+ * whether or not helper_argv carries its own terminating NULL.
+ * NOTE(review): existing callers presumably count a trailing NULL in
+ * helper_argc - confirm; the extra terminator is harmless then.
+ *
+ * In the parent a warning is logged if vfork took more than 3
+ * seconds, the child is tracked with ctdb_track_child() and its PID
+ * is returned.  Returns -1 on allocation or vfork() failure.  If
+ * execv() fails the child exits with status 1.
+ */
+pid_t ctdb_vfork_exec(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
+		      const char *helper, int helper_argc,
+		      const char **helper_argv)
+{
+	pid_t pid;
+	struct timeval before;
+	double delta_t;
+	char **argv;
+	int i;
+
+	/* helper + helper_argc arguments + terminating NULL */
+	argv = talloc_array(mem_ctx, char *, helper_argc + 2);
+	if (argv == NULL) {
+		DEBUG(DEBUG_ERR, ("Memory allocation error\n"));
+		return -1;
+	}
+
+	argv[0] = discard_const(helper);
+	for (i=0; i<helper_argc; i++) {
+		argv[i+1] = discard_const(helper_argv[i]);
+	}
+	argv[helper_argc + 1] = NULL;
+
+	before = timeval_current();
+
+	pid = vfork();
+	if (pid == -1) {
+		DEBUG(DEBUG_ERR, ("vfork() failed (%s)\n", strerror(errno)));
+		/* Nothing will use the argument vector now */
+		talloc_free(argv);
+		return -1;
+	}
+
+	if (pid == 0) {
+		/* Child: only exec or _exit are safe after vfork() */
+		execv(helper, argv);
+		_exit(1);
+	}
+
+	delta_t = timeval_elapsed(&before);
+	if (delta_t > 3.0) {
+		DEBUG(DEBUG_WARNING, ("vfork() took %lf seconds\n", delta_t));
+	}
+
+	ctdb_track_child(ctdb, pid);
+	return pid;
+}
+
+/*
+ * tevent signal handler for SIGCHLD.
+ *
+ * Reaps every child that has exited, without blocking.  In the main
+ * daemon each reaped PID is looked up in ctdb->child_processes; an
+ * unknown PID is logged as an error, and the tracking entry (if any)
+ * is freed.  The loop ends when waitpid() reports no more exited
+ * children (0) or an error (-1, which is logged).
+ */
+static void ctdb_sigchld_handler(struct tevent_context *ev,
+	struct tevent_signal *te, int signum, int count,
+	void *dont_care,
+	void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	int status;
+	pid_t pid = -1;
+
+	while (pid != 0) {
+		pid = waitpid(-1, &status, WNOHANG);
+		if (pid == -1) {
+			DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
+			return;
+		}
+		if (pid > 0) {
+			char *process;
+
+			if (getpid() != ctdb->ctdbd_pid) {
+				continue;
+			}
+
+			process = trbt_lookup32(ctdb->child_processes, pid);
+			if (process == NULL) {
+				DEBUG(DEBUG_ERR,("Got SIGCHLD from pid:%d we didn not spawn with ctdb_fork\n", (int)pid));
+			}
+
+			/* Never hand a NULL pointer to a %s conversion */
+			DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d %s\n", (int)pid,
+					    process != NULL ? process : "(untracked)"));
+			talloc_free(process);
+		}
+	}
+}
+
+
+/*
+ * Create the child-process tracking tree and install the SIGCHLD
+ * handler on ctdb->ev.  Returns the tevent signal event, or NULL if
+ * it could not be added.
+ */
+struct tevent_signal *
+ctdb_init_sigchld(struct ctdb_context *ctdb)
+{
+	ctdb->child_processes = trbt_create(ctdb, 0);
+
+	return tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
+				 ctdb_sigchld_handler, ctdb);
+}
+
+/*
+ * kill() wrapper that, in the main daemon, only signals PIDs recorded
+ * in ctdb->child_processes.
+ *
+ * Signal 0 (existence probe) and calls made outside the main daemon
+ * go straight to kill().  For an untracked PID an error is logged,
+ * nothing is sent and 0 is returned.
+ */
+int
+ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum)
+{
+	if (signum == 0 || getpid() != ctdb->ctdbd_pid) {
+		return kill(pid, signum);
+	}
+
+	if (trbt_lookup32(ctdb->child_processes, pid) == NULL) {
+		DEBUG(DEBUG_ERR,("ctdb_kill: trying to kill(%d, %d) a process that does not exist\n", pid, signum));
+		return 0;
+	}
+
+	return kill(pid, signum);
+}
diff --git a/ctdb/server/ctdb_freeze.c b/ctdb/server/ctdb_freeze.c
new file mode 100644
index 0000000..06aeacf
--- /dev/null
+++ b/ctdb/server/ctdb_freeze.c
@@ -0,0 +1,923 @@
+/*
+ ctdb freeze handling
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+
+#include "ctdb_private.h"
+
+#include "common/rb_tree.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/**
+ * Cancel a transaction on database
+ */
+static int db_transaction_cancel_handler(struct ctdb_db_context *ctdb_db,
+	void *private_data)
+{
+	/* TDB_NOLOCK is set only for the duration of the cancel call */
+	tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+	if (tdb_transaction_cancel(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to cancel transaction for db %s\n",
+			ctdb_db->db_name));
+	}
+
+	tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+	/* A failed cancel is only logged; the handler always reports 0 */
+	return 0;
+}
+
+/**
+ * Start a transaction on database
+ */
+static int db_transaction_start_handler(struct ctdb_db_context *ctdb_db,
+	void *private_data)
+{
+	/* private_data points at a bool: cancel a previous transaction first? */
+	bool cancel_first = *(bool *)private_data;
+	int rc;
+
+	tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+	if (cancel_first &&
+	    tdb_transaction_cancel(ctdb_db->ltdb->tdb) != 0) {
+		/* Not fatal: still try to start the new transaction */
+		DEBUG(DEBUG_ERR,
+			("Failed to cancel transaction for db %s\n",
+			ctdb_db->db_name));
+	}
+
+	rc = tdb_transaction_start(ctdb_db->ltdb->tdb);
+	tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+	if (rc == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR, ("Failed to start transaction for db %s\n",
+		ctdb_db->db_name));
+	return -1;
+}
+
+/**
+ * Commit a transaction on database
+ */
+static int db_transaction_commit_handler(struct ctdb_db_context *ctdb_db,
+	void *private_data)
+{
+	/* private_data points at the number of healthy nodes */
+	unsigned int num_healthy = *(unsigned int *)private_data;
+	int rc;
+
+	tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+	rc = tdb_transaction_commit(ctdb_db->ltdb->tdb);
+	tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to commit transaction for db %s\n",
+			ctdb_db->db_name));
+		return -1;
+	}
+
+	/* After a successful commit, record the health of the database */
+	rc = ctdb_update_persistent_health(ctdb_db->ctdb, ctdb_db, NULL,
+		num_healthy);
+	if (rc == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR, ("Failed to update persistent health for db %s\n",
+		ctdb_db->db_name));
+	return rc;
+}
+
+/* a list of control requests waiting for db freeze */
+struct ctdb_db_freeze_waiter {
+ /* list links, maintained with DLIST_ADD/DLIST_REMOVE on the handle's
+ * 'waiters' list */
+ struct ctdb_db_freeze_waiter *next, *prev;
+ struct ctdb_context *ctdb;
+ /* interpreted by the waiter's destructor: either the stolen control
+ * request (ctdb_control_db_freeze) or the all-database freeze handle
+ * (db_freeze) */
+ void *private_data;
+ /* initialised to -1 by the creators; set to 0 once the database is
+ * frozen */
+ int32_t status;
+};
+
+/* a handle to a db freeze lock child process */
+struct ctdb_db_freeze_handle {
+ /* database this handle freezes */
+ struct ctdb_db_context *ctdb_db;
+ /* lock request returned by ctdb_lock_db(); freed by the destructor */
+ struct lock_request *lreq;
+ /* waiters to be released once the freeze completes or fails */
+ struct ctdb_db_freeze_waiter *waiters;
+};
+
+/**
+ * Called when freeing database freeze handle
+ */
+static int ctdb_db_freeze_handle_destructor(struct ctdb_db_freeze_handle *h)
+{
+	struct ctdb_db_context *db = h->ctdb_db;
+
+	DEBUG(DEBUG_ERR, ("Release freeze handle for db %s\n",
+		db->db_name));
+
+	/* A transaction started under this freeze cannot outlive it */
+	if (db->freeze_transaction_started) {
+		db_transaction_cancel_handler(db, NULL);
+		db->freeze_transaction_started = false;
+	}
+
+	/* Reset the per-database freeze state */
+	db->invalid_records = false;
+	db->freeze_handle = NULL;
+	db->freeze_mode = CTDB_FREEZE_NONE;
+
+	talloc_free(h->lreq);
+	return 0;
+}
+
+/**
+ * Called when a database is frozen
+ */
+static void ctdb_db_freeze_handler(void *private_data, bool locked)
+{
+	struct ctdb_db_freeze_handle *handle = talloc_get_type_abort(
+		private_data, struct ctdb_db_freeze_handle);
+	struct ctdb_db_freeze_waiter *waiter;
+
+	/* A callback while already frozen is treated as the child dying */
+	if (handle->ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR, ("Freeze db child died - unfreezing\n"));
+		handle->ctdb_db->freeze_mode = CTDB_FREEZE_NONE;
+		talloc_free(handle);
+		return;
+	}
+
+	if (!locked) {
+		DEBUG(DEBUG_ERR, ("Failed to get db lock for %s\n",
+			handle->ctdb_db->db_name));
+		handle->ctdb_db->freeze_mode = CTDB_FREEZE_NONE;
+		talloc_free(handle);
+		return;
+	}
+
+	handle->ctdb_db->freeze_mode = CTDB_FREEZE_FROZEN;
+
+	/* Release every waiter with success; freeing runs its destructor */
+	for (waiter = handle->waiters;
+	     waiter != NULL;
+	     waiter = handle->waiters) {
+		waiter->status = 0;
+		DLIST_REMOVE(handle->waiters, waiter);
+		talloc_free(waiter);
+	}
+}
+
+/**
+ * Start freeze process for a database
+ */
+static void ctdb_start_db_freeze(struct ctdb_db_context *ctdb_db)
+{
+	struct ctdb_db_freeze_handle *h;
+
+	/* Nothing to do when already frozen or a freeze is in progress */
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN ||
+	    ctdb_db->freeze_handle != NULL) {
+		return;
+	}
+
+	DEBUG(DEBUG_ERR, ("Freeze db: %s\n", ctdb_db->db_name));
+
+	ctdb_stop_vacuuming(ctdb_db->ctdb);
+
+	h = talloc_zero(ctdb_db, struct ctdb_db_freeze_handle);
+	CTDB_NO_MEMORY_FATAL(ctdb_db->ctdb, h);
+	h->ctdb_db = ctdb_db;
+
+	/* ctdb_db_freeze_handler() is called back with the lock result */
+	h->lreq = ctdb_lock_db(h, ctdb_db, false, ctdb_db_freeze_handler, h);
+	CTDB_NO_MEMORY_FATAL(ctdb_db->ctdb, h->lreq);
+
+	/* The destructor is only installed once the handle is complete */
+	talloc_set_destructor(h, ctdb_db_freeze_handle_destructor);
+
+	ctdb_db->freeze_mode = CTDB_FREEZE_PENDING;
+	ctdb_db->freeze_handle = h;
+}
+
+/**
+ * Reply to a waiter for db freeze
+ */
+static int ctdb_db_freeze_waiter_destructor(struct ctdb_db_freeze_waiter *w)
+{
+	/*
+	 * private_data is the control request stolen onto the waiter.  It
+	 * is a talloc_memdup() copy, so talloc_get_type cannot be used and
+	 * a plain cast is required.
+	 */
+	ctdb_request_control_reply(w->ctdb,
+		(struct ctdb_req_control_old *)w->private_data,
+		NULL, w->status, NULL);
+
+	return 0;
+}
+
+/**
+ * freeze a database
+ */
+int32_t ctdb_control_db_freeze(struct ctdb_context *ctdb,
+	struct ctdb_req_control_old *c,
+	uint32_t db_id,
+	bool *async_reply)
+{
+	struct ctdb_db_context *ctdb_db = find_ctdb_db(ctdb, db_id);
+	struct ctdb_db_freeze_waiter *w;
+
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR, ("Freeze db for unknown dbid 0x%08x\n", db_id));
+		return -1;
+	}
+
+	/* Already frozen: answer synchronously */
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR, ("Freeze db: %s frozen\n", ctdb_db->db_name));
+		return 0;
+	}
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	/*
+	 * Queue a waiter on the freeze handle.  The reply is sent from the
+	 * waiter's destructor, with status -1 unless the freeze succeeds.
+	 */
+	w = talloc(ctdb_db->freeze_handle, struct ctdb_db_freeze_waiter);
+	CTDB_NO_MEMORY(ctdb, w);
+	w->status = -1;
+	w->ctdb = ctdb;
+	w->private_data = talloc_steal(w, c);
+	talloc_set_destructor(w, ctdb_db_freeze_waiter_destructor);
+	DLIST_ADD(ctdb_db->freeze_handle->waiters, w);
+
+	*async_reply = true;
+	return 0;
+}
+
+/**
+ * Thaw a database
+ */
+int32_t ctdb_control_db_thaw(struct ctdb_context *ctdb, uint32_t db_id)
+{
+	struct ctdb_db_context *ctdb_db = find_ctdb_db(ctdb, db_id);
+
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR, ("Thaw db for unknown dbid 0x%08x\n", db_id));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR, ("Thaw db: %s generation %u\n", ctdb_db->db_name,
+		ctdb_db->generation));
+
+	/* Dropping the freeze handle thaws the database (see its destructor) */
+	TALLOC_FREE(ctdb_db->freeze_handle);
+
+	ctdb_call_resend_db(ctdb_db);
+
+	return 0;
+}
+
+
+/*
+ a list of control requests waiting for a freeze lock child to get
+ the database locks
+ */
+struct ctdb_freeze_waiter {
+ /* list links, maintained with DLIST_ADD/DLIST_REMOVE */
+ struct ctdb_freeze_waiter *next, *prev;
+ struct ctdb_context *ctdb;
+ /* control request to answer; stolen onto the waiter by
+ * ctdb_control_freeze() */
+ struct ctdb_req_control_old *c;
+ /* reply status: starts at -1, set to 0 when the freeze succeeds */
+ int32_t status;
+};
+
+/* a handle to a freeze lock child process */
+struct ctdb_freeze_handle {
+ struct ctdb_context *ctdb;
+ /* num_total: databases counted when the freeze was started;
+ * num_locked / num_failed: per-database results seen so far */
+ unsigned int num_total, num_locked, num_failed;
+ /* controls waiting for the freeze of all databases to finish */
+ struct ctdb_freeze_waiter *waiters;
+};
+
+/*
+ * ctdb_db_iterator() callback: thaw one database by freeing its freeze
+ * handle.  private_data is unused.
+ */
+static int db_thaw(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	/* The handle's destructor resets the database's freeze state */
+	(void)talloc_free(ctdb_db->freeze_handle);
+
+	return 0;
+}
+
+/*
+ destroy a freeze handle
+ */
+static int ctdb_freeze_handle_destructor(struct ctdb_freeze_handle *h)
+{
+	struct ctdb_context *daemon = h->ctdb;
+
+	DEBUG(DEBUG_ERR,("Release freeze handle\n"));
+
+	/* Abandon any transaction that was started while frozen */
+	if (daemon->freeze_transaction_started) {
+		ctdb_db_iterator(daemon, db_transaction_cancel_handler, NULL);
+		daemon->freeze_transaction_started = false;
+	}
+
+	/*
+	 * Thaw every database before the global state is reset: freeing
+	 * the per-database handles releases their waiters, whose
+	 * destructors may still look at the global freeze mode.
+	 */
+	ctdb_db_iterator(daemon, db_thaw, NULL);
+
+	daemon->freeze_mode = CTDB_FREEZE_NONE;
+	daemon->freeze_handle = NULL;
+
+	return 0;
+}
+
+/*
+ called when the child writes its status to us
+ */
+static void ctdb_freeze_lock_handler(void *private_data, bool locked)
+{
+	struct ctdb_freeze_handle *h = talloc_get_type_abort(private_data,
+		struct ctdb_freeze_handle);
+	struct ctdb_freeze_waiter *waiter;
+
+	/* A notification while already frozen is treated as the child dying */
+	if (h->ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_INFO,("freeze child died - unfreezing\n"));
+		talloc_free(h);
+		return;
+	}
+
+	/* Without all the locks the freeze attempt is abandoned */
+	if (!locked) {
+		DEBUG(DEBUG_ERR,("Failed to get locks in ctdb_freeze_child\n"));
+		talloc_free(h);
+		return;
+	}
+
+	h->ctdb->freeze_mode = CTDB_FREEZE_FROZEN;
+
+	if (h->ctdb->freeze_handle != h) {
+		DEBUG(DEBUG_ERR,("lockwait finished but h is not linked\n"));
+	}
+
+	/* Release every waiter with success; freeing runs its destructor */
+	for (waiter = h->waiters; waiter != NULL; waiter = h->waiters) {
+		waiter->status = 0;
+		DLIST_REMOVE(h->waiters, waiter);
+		talloc_free(waiter);
+	}
+}
+
+/**
+ * When single database is frozen
+ */
+static int db_freeze_waiter_destructor(struct ctdb_db_freeze_waiter *w)
+{
+	/* For these waiters private_data is the all-database freeze handle */
+	struct ctdb_freeze_handle *all = talloc_get_type_abort(
+		w->private_data, struct ctdb_freeze_handle);
+
+	if (w->status != 0) {
+		all->num_failed += 1;
+	} else {
+		all->num_locked += 1;
+	}
+
+	/* Keep waiting until the status of every database is known */
+	if (all->num_locked + all->num_failed != all->num_total) {
+		return 0;
+	}
+
+	/* The overall freeze succeeded only if every database got locked */
+	ctdb_freeze_lock_handler(all, all->num_locked == all->num_total);
+
+	return 0;
+}
+
+/**
+ * Invalidate the records in the database.
+ * This only applies to volatile databases.
+ */
+static int db_invalidate(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	/* Only volatile databases get their records marked invalid */
+	if (!ctdb_db_volatile(ctdb_db)) {
+		return 0;
+	}
+
+	ctdb_db->invalid_records = true;
+	return 0;
+}
+
+/**
+ * Count the number of databases
+ */
+static int db_count(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	/* private_data points at the caller's running unsigned int total */
+	*(unsigned int *)private_data += 1;
+
+	return 0;
+}
+
+/**
+ * Freeze a single database
+ */
+static int db_freeze(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	struct ctdb_freeze_handle *h = talloc_get_type_abort(
+		private_data, struct ctdb_freeze_handle);
+	struct ctdb_db_freeze_waiter *w;
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	/* The waiter's destructor reports this database's result to 'h' */
+	w = talloc(ctdb_db->freeze_handle, struct ctdb_db_freeze_waiter);
+	CTDB_NO_MEMORY(h->ctdb, w);
+	w->status = -1;
+	w->private_data = h;
+	w->ctdb = h->ctdb;
+	talloc_set_destructor(w, db_freeze_waiter_destructor);
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		/* Freeze still pending: wait for the per-database handler */
+		DLIST_ADD(ctdb_db->freeze_handle->waiters, w);
+		return 0;
+	}
+
+	/* Already frozen: report success right away via the destructor */
+	w->status = 0;
+	talloc_free(w);
+	return 0;
+}
+
+/*
+ start the freeze process for all databases
+ This is only called from ctdb_control_freeze(), which is called
+ only on node becoming INACTIVE. So mark the records invalid.
+ */
+static void ctdb_start_freeze(struct ctdb_context *ctdb)
+{
+ struct ctdb_freeze_handle *h;
+ int ret;
+
+ /* Records are marked invalid on every call, whatever the current state */
+ ctdb_db_iterator(ctdb, db_invalidate, NULL);
+
+ if (ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+ unsigned int count = 0;
+
+ /*
+ * Check if all the databases are frozen
+ *
+ * It's possible that the databases can get attached after
+ * initial freeze. This typically happens during startup as
+ * CTDB will only attach persistent databases and go in to
+ * startup freeze. The recovery master during recovery will
+ * attach all the missing databases.
+ */
+
+ h = ctdb->freeze_handle;
+ if (h == NULL) {
+ /* FROZEN without a handle is inconsistent: reset the mode */
+ ctdb->freeze_mode = CTDB_FREEZE_NONE;
+ return;
+ }
+
+ ret = ctdb_db_iterator(ctdb, db_count, &count);
+ if (ret != 0) {
+ TALLOC_FREE(ctdb->freeze_handle);
+ ctdb->freeze_mode = CTDB_FREEZE_NONE;
+ return;
+ }
+
+ /* A different database count means the freeze must be redone */
+ if (count != h->num_total) {
+ DEBUG(DEBUG_ERR, ("Freeze all: incremental\n"));
+
+ /* Restart the bookkeeping and reuse the existing handle */
+ h->num_total = count;
+ h->num_locked = 0;
+ h->num_failed = 0;
+
+ ctdb->freeze_mode = CTDB_FREEZE_PENDING;
+
+ ret = ctdb_db_iterator(ctdb, db_freeze, h);
+ if (ret != 0) {
+ TALLOC_FREE(ctdb->freeze_handle);
+ ctdb->freeze_mode = CTDB_FREEZE_NONE;
+ }
+ }
+ return;
+ }
+
+ if (ctdb->freeze_handle != NULL) {
+ /* already trying to freeze */
+ return;
+ }
+
+ DEBUG(DEBUG_ERR, ("Freeze all\n"));
+
+ /* Stop any vacuuming going on: we don't want to wait. */
+ ctdb_stop_vacuuming(ctdb);
+
+ /* create freeze lock children for each database */
+ h = talloc_zero(ctdb, struct ctdb_freeze_handle);
+ CTDB_NO_MEMORY_FATAL(ctdb, h);
+ h->ctdb = ctdb;
+ talloc_set_destructor(h, ctdb_freeze_handle_destructor);
+ ctdb->freeze_handle = h;
+
+ /* From here on, freeing h runs the destructor, which resets
+ * ctdb->freeze_handle and ctdb->freeze_mode */
+ ret = ctdb_db_iterator(ctdb, db_count, &h->num_total);
+ if (ret != 0) {
+ talloc_free(h);
+ return;
+ }
+
+ ctdb->freeze_mode = CTDB_FREEZE_PENDING;
+
+ ret = ctdb_db_iterator(ctdb, db_freeze, h);
+ if (ret != 0) {
+ talloc_free(h);
+ return;
+ }
+
+ /* With no databases there is nothing to wait for */
+ if (h->num_total == 0) {
+ ctdb->freeze_mode = CTDB_FREEZE_FROZEN;
+ }
+}
+
+/*
+ destroy a waiter for a freeze mode change
+ */
+static int ctdb_freeze_waiter_destructor(struct ctdb_freeze_waiter *w)
+{
+ /* The control is answered when the waiter is freed; w->status is -1
+ * unless the freeze handler set it to 0 before freeing the waiter */
+ ctdb_request_control_reply(w->ctdb, w->c, NULL, w->status, NULL);
+ return 0;
+}
+
+/*
+ freeze all the databases
+ This control is only used when freezing database on node becoming INACTIVE.
+ So mark the records invalid in ctdb_start_freeze().
+ */
+int32_t ctdb_control_freeze(struct ctdb_context *ctdb,
+	struct ctdb_req_control_old *c, bool *async_reply)
+{
+	struct ctdb_freeze_waiter *w;
+
+	ctdb_start_freeze(ctdb);
+
+	/* Nothing to wait for when everything is frozen already */
+	if (ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR, ("Freeze all: frozen\n"));
+		return 0;
+	}
+
+	if (ctdb->freeze_handle == NULL) {
+		DEBUG(DEBUG_ERR,("No freeze lock handle when adding a waiter\n"));
+		return -1;
+	}
+
+	/* No databases at all: the freeze is trivially complete */
+	if (ctdb->freeze_handle->num_total == 0) {
+		return 0;
+	}
+
+	/*
+	 * Queue a waiter on the handle.  The reply is sent from the
+	 * waiter's destructor, with status -1 unless the freeze succeeds.
+	 */
+	w = talloc(ctdb->freeze_handle, struct ctdb_freeze_waiter);
+	CTDB_NO_MEMORY(ctdb, w);
+	w->status = -1;
+	w->ctdb = ctdb;
+	w->c = talloc_steal(w, c);
+	talloc_set_destructor(w, ctdb_freeze_waiter_destructor);
+	DLIST_ADD(ctdb->freeze_handle->waiters, w);
+
+	/* The caller must not reply now; the destructor will */
+	*async_reply = true;
+	return 0;
+}
+
+
+/*
+ * ctdb_db_iterator() callback: start freezing one database and run the
+ * event loop (private_data is the tevent context) until the freeze is no
+ * longer pending.  Returns 0 if the database ended up frozen, else -1.
+ */
+static int db_freeze_block(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	struct tevent_context *ev = (struct tevent_context *)private_data;
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	while (ctdb_db->freeze_mode == CTDB_FREEZE_PENDING) {
+		tevent_loop_once(ev);
+	}
+
+	return (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) ? 0 : -1;
+}
+
+/*
+ block until we are frozen, used during daemon startup
+ */
+bool ctdb_blocking_freeze(struct ctdb_context *ctdb)
+{
+	/* True only if every database reported a successful freeze */
+	return ctdb_db_iterator(ctdb, db_freeze_block, ctdb->ev) == 0;
+}
+
+/*
+ thaw the databases
+ */
+int32_t ctdb_control_thaw(struct ctdb_context *ctdb, bool check_recmode)
+{
+	bool recovering = (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE);
+
+	/* When asked to check, refuse to thaw in the middle of a recovery */
+	if (check_recmode && recovering) {
+		DEBUG(DEBUG_ERR, ("Failing to thaw databases while "
+			"recovery is active\n"));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR,("Thawing all\n"));
+
+	/* Abandon any transaction that was started while frozen */
+	if (ctdb->freeze_transaction_started) {
+		ctdb_db_iterator(ctdb, db_transaction_cancel_handler, NULL);
+		ctdb->freeze_transaction_started = false;
+	}
+
+	/* Thaw each database first, then drop the global handle */
+	ctdb_db_iterator(ctdb, db_thaw, NULL);
+	TALLOC_FREE(ctdb->freeze_handle);
+
+	ctdb_call_resend_all(ctdb);
+
+	return 0;
+}
+
+/**
+ * Database transaction wrappers
+ *
+ * These functions are wrappers around transaction start/cancel/commit handlers.
+ */
+
+struct db_start_transaction_state {
+ /* id recorded in ctdb_db->freeze_transaction_id on success */
+ uint32_t transaction_id;
+ /* combined with the database's own freeze_transaction_started flag to
+ * decide whether an existing transaction is cancelled first */
+ bool transaction_started;
+};
+
+/*
+ * Start a recovery transaction on a frozen database.  private_data is a
+ * struct db_start_transaction_state.  Returns 0 on success and -1 if the
+ * database is not frozen or the transaction cannot be started.
+ */
+static int db_start_transaction(struct ctdb_db_context *ctdb_db,
+	void *private_data)
+{
+	struct db_start_transaction_state *state =
+		(struct db_start_transaction_state *)private_data;
+	bool cancel_existing;
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,
+			("Database %s not frozen, cannot start transaction\n",
+			ctdb_db->db_name));
+		return -1;
+	}
+
+	/* Cancel first only if both caller and database say one is open */
+	cancel_existing = state->transaction_started &&
+		ctdb_db->freeze_transaction_started;
+
+	if (db_transaction_start_handler(ctdb_db, &cancel_existing) != 0) {
+		return -1;
+	}
+
+	ctdb_db->freeze_transaction_id = state->transaction_id;
+	ctdb_db->freeze_transaction_started = true;
+
+	return 0;
+}
+
+/*
+ * Cancel the recovery transaction on a database and clear its
+ * "transaction started" flag.  A non-zero result from the underlying
+ * handler is returned unchanged and leaves the flag untouched.
+ */
+static int db_cancel_transaction(struct ctdb_db_context *ctdb_db,
+	void *private_data)
+{
+	int rc = db_transaction_cancel_handler(ctdb_db, private_data);
+
+	if (rc == 0) {
+		ctdb_db->freeze_transaction_started = false;
+	}
+
+	return rc;
+}
+
+struct db_commit_transaction_state {
+ /* must match ctdb_db->freeze_transaction_id; becomes the database
+ * generation after a successful commit */
+ uint32_t transaction_id;
+ /* passed through to ctdb_update_persistent_health() */
+ unsigned int healthy_nodes;
+};
+
+/*
+ * Commit the recovery transaction on a frozen database.  private_data is
+ * a struct db_commit_transaction_state.  Returns 0 on success; -1 if the
+ * database is not frozen, no transaction is open, the transaction id
+ * does not match, or the commit itself fails.
+ */
+static int db_commit_transaction(struct ctdb_db_context *ctdb_db,
+	void *private_data)
+{
+	struct db_commit_transaction_state *state =
+		(struct db_commit_transaction_state *)private_data;
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,
+			("Database %s not frozen, cannot commit transaction\n",
+			ctdb_db->db_name));
+		return -1;
+	}
+
+	if (!ctdb_db->freeze_transaction_started) {
+		DEBUG(DEBUG_ERR, ("Transaction not started on %s\n",
+			ctdb_db->db_name));
+		return -1;
+	}
+
+	if (state->transaction_id != ctdb_db->freeze_transaction_id) {
+		DEBUG(DEBUG_ERR,
+			("Incorrect transaction commit id 0x%08x for %s\n",
+			state->transaction_id, ctdb_db->db_name));
+		return -1;
+	}
+
+	if (db_transaction_commit_handler(ctdb_db,
+					  &state->healthy_nodes) != 0) {
+		return -1;
+	}
+
+	/* The committed transaction id becomes the new generation */
+	ctdb_db->generation = state->transaction_id;
+	ctdb_db->freeze_transaction_id = 0;
+	ctdb_db->freeze_transaction_started = false;
+
+	return 0;
+}
+
+/**
+ * Start a transaction on a database - used for db recovery
+ */
+int32_t ctdb_control_db_transaction_start(struct ctdb_context *ctdb,
+	TDB_DATA indata)
+{
+	/* The control payload is a struct ctdb_transdb (db id + transaction id) */
+	struct ctdb_transdb *req = (struct ctdb_transdb *)indata.dptr;
+	struct db_start_transaction_state state;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, req->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+			("Transaction start for unknown dbid 0x%08x\n",
+			req->db_id));
+		return -1;
+	}
+
+	state.transaction_started = true;
+	state.transaction_id = req->tid;
+
+	return db_start_transaction(ctdb_db, &state);
+}
+
+/**
+ * Cancel a transaction on a database - used for db recovery
+ */
+int32_t ctdb_control_db_transaction_cancel(struct ctdb_context *ctdb,
+	TDB_DATA indata)
+{
+	/* The control payload is just the database id */
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	struct ctdb_db_context *ctdb_db = find_ctdb_db(ctdb, db_id);
+
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+			("Transaction cancel for unknown dbid 0x%08x\n", db_id));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR, ("Recovery db transaction cancelled for %s\n",
+		ctdb_db->db_name));
+
+	return db_cancel_transaction(ctdb_db, NULL);
+}
+
+/**
+ * Commit a transaction on a database - used for db recovery
+ */
+int32_t ctdb_control_db_transaction_commit(struct ctdb_context *ctdb,
+	TDB_DATA indata)
+{
+	/* The control payload is a struct ctdb_transdb (db id + transaction id) */
+	struct ctdb_transdb *req = (struct ctdb_transdb *)indata.dptr;
+	struct db_commit_transaction_state state;
+	struct ctdb_db_context *ctdb_db;
+	unsigned int idx;
+
+	ctdb_db = find_ctdb_db(ctdb, req->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+			("Transaction commit for unknown dbid 0x%08x\n",
+			req->db_id));
+		return -1;
+	}
+
+	/* A node counts as healthy when it has no flags set at all */
+	state.healthy_nodes = 0;
+	for (idx = 0; idx < ctdb->num_nodes; idx++) {
+		if (ctdb->nodes[idx]->flags == 0) {
+			state.healthy_nodes += 1;
+		}
+	}
+	state.transaction_id = req->tid;
+
+	return db_commit_transaction(ctdb_db, &state);
+}
+
+/*
+ wipe a database - only possible when in a frozen transaction
+ */
+/*
+ * Wipe all records of a database.
+ *
+ * indata carries a struct ctdb_transdb (database id and transaction id).
+ * The wipe is only performed when the database is frozen, a freeze
+ * transaction has been started and the supplied transaction id matches
+ * it.  For volatile databases the delete and fetch queues are discarded
+ * and re-created empty.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_transdb w = *(struct ctdb_transdb *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, w.db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", w.db_id));
+		return -1;
+	}
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed transaction_start while not frozen\n"));
+		return -1;
+	}
+
+	if (!ctdb_db->freeze_transaction_started) {
+		DEBUG(DEBUG_ERR,(__location__ " transaction not started\n"));
+		return -1;
+	}
+
+	if (w.tid != ctdb_db->freeze_transaction_id) {
+		DEBUG(DEBUG_ERR,(__location__ " incorrect transaction id 0x%x in commit\n", w.tid));
+		return -1;
+	}
+
+	if (tdb_wipe_all(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to wipe database for db '%s'\n",
+			ctdb_db->db_name));
+		return -1;
+	}
+
+	if (ctdb_db_volatile(ctdb_db)) {
+		/*
+		 * Free AND clear both queue pointers before re-creating
+		 * them, so that an early return on allocation failure
+		 * never leaves ctdb_db holding a pointer to freed memory.
+		 */
+		TALLOC_FREE(ctdb_db->delete_queue);
+		TALLOC_FREE(ctdb_db->fetch_queue);
+
+		ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->delete_queue == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " Failed to re-create "
+				"the delete queue.\n"));
+			return -1;
+		}
+		ctdb_db->fetch_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->fetch_queue == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " Failed to re-create "
+				"the fetch queue.\n"));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Report whether this database is currently in the frozen state. */
+bool ctdb_db_frozen(struct ctdb_db_context *ctdb_db)
+{
+	return ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN;
+}
+
+/* Report whether the daemon-wide freeze mode is "frozen". */
+bool ctdb_db_all_frozen(struct ctdb_context *ctdb)
+{
+	return ctdb->freeze_mode == CTDB_FREEZE_FROZEN;
+}
+
+/*
+ * Decide whether the database may be accessed right now.
+ *
+ * Access is allowed when the database is not being frozen at all, or,
+ * while a freeze is pending or in place, when a freeze transaction has
+ * been started (required during recovery).  On an inactive node no
+ * transaction is started, so access stays denied.
+ */
+bool ctdb_db_allow_access(struct ctdb_db_context *ctdb_db)
+{
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_NONE) {
+		return true;
+	}
+
+	if (ctdb_db->freeze_transaction_started) {
+		return true;
+	}
+
+	return false;
+}
diff --git a/ctdb/server/ctdb_keepalive.c b/ctdb/server/ctdb_keepalive.c
new file mode 100644
index 0000000..9155ade
--- /dev/null
+++ b/ctdb/server/ctdb_keepalive.c
@@ -0,0 +1,234 @@
+/*
+ monitoring links to all other nodes to detect dead nodes
+
+
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/time.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "version.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+
+/*
+ * Version number advertised in keepalive packets.
+ *
+ * Computed on first use as (major << 16) | minor and cached in a static.
+ * A positive integer in the CTDB_TEST_SAMBA_VERSION environment variable
+ * overrides it; any other value is reported with a warning and ignored.
+ */
+static uint32_t keepalive_version(void)
+{
+	static uint32_t version = 0;
+	const char *env;
+	int parsed;
+
+	/* Cached from an earlier call */
+	if (version != 0) {
+		return version;
+	}
+
+	version = (SAMBA_VERSION_MAJOR << 16) | SAMBA_VERSION_MINOR;
+
+	env = getenv("CTDB_TEST_SAMBA_VERSION");
+	if (env == NULL) {
+		return version;
+	}
+
+	parsed = atoi(env);
+	if (parsed > 0) {
+		version = parsed;
+	} else {
+		DBG_WARNING("Failed to parse env var: %s\n", env);
+	}
+
+	return version;
+}
+
+/* Whole seconds elapsed since ctdb->ctdbd_start_time. */
+static uint32_t keepalive_uptime(struct ctdb_context *ctdb)
+{
+	struct timeval now = tevent_timeval_current();
+
+	return now.tv_sec - ctdb->ctdbd_start_time.tv_sec;
+}
+
+/*
+ send a keepalive packet to the other node
+*/
+static void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
+{
+	struct ctdb_req_keepalive_old *r;
+
+	/* Without transport methods there is nothing to send through */
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,
+			("Failed to send keepalive. Transport is DOWN\n"));
+		return;
+	}
+
+	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_KEEPALIVE,
+		sizeof(struct ctdb_req_keepalive_old),
+		struct ctdb_req_keepalive_old);
+	CTDB_NO_MEMORY_FATAL(ctdb, r);
+
+	/* Header: addressed to destnode, with request id 0 */
+	r->hdr.reqid = 0;
+	r->hdr.destnode = destnode;
+
+	/* Payload: our version and uptime, checked by the receiving node */
+	r->version = keepalive_version();
+	r->uptime = keepalive_uptime(ctdb);
+
+	CTDB_INCREMENT_STAT(ctdb, keepalive_packets_sent);
+
+	ctdb_queue_packet(ctdb, &r->hdr);
+
+	/* The local packet is released once it has been handed over */
+	talloc_free(r);
+}
+
+/*
+ see if any nodes are dead
+ */
+static void ctdb_check_for_dead_nodes(struct tevent_context *ev,
+	struct tevent_timer *te,
+	struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	unsigned int idx;
+
+	/*
+	 * Walk all nodes.  Deleted nodes and ourselves are skipped;
+	 * disconnected nodes are only checked for signs of life; every
+	 * other node is sent a keepalive.
+	 */
+	for (idx = 0; idx < ctdb->num_nodes; idx++) {
+		struct ctdb_node *node = ctdb->nodes[idx];
+
+		if ((node->flags & NODE_FLAGS_DELETED) ||
+		    node->pnn == ctdb->pnn) {
+			continue;
+		}
+
+		if (node->flags & NODE_FLAGS_DISCONNECTED) {
+			/* Received traffic means it may have come back */
+			if (node->rx_cnt != 0) {
+				ctdb_node_connected(node);
+			}
+			continue;
+		}
+
+		/* Count consecutive intervals in which nothing was received */
+		if (node->rx_cnt != 0) {
+			node->dead_count = 0;
+		} else {
+			node->dead_count++;
+		}
+		node->rx_cnt = 0;
+
+		if (node->dead_count >= ctdb->tunable.keepalive_limit) {
+			DEBUG(DEBUG_NOTICE,("dead count reached for node %u\n", node->pnn));
+			ctdb_node_dead(node);
+			/* A keepalive is still sent; tx_cnt is left untouched */
+			ctdb_send_keepalive(ctdb, node->pnn);
+			/* maybe tell the transport layer to kill the
+			   sockets as well?
+			*/
+		} else {
+			DEBUG(DEBUG_DEBUG,("sending keepalive to %u\n", node->pnn));
+			ctdb_send_keepalive(ctdb, node->pnn);
+
+			node->tx_cnt = 0;
+		}
+	}
+
+	/* Re-arm the timer for the next keepalive interval */
+	tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
+		timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
+		ctdb_check_for_dead_nodes, ctdb);
+}
+
+
+/*
+ * Start keepalive monitoring: create the talloc context that owns the
+ * keepalive timer and schedule the first dead-node check one keepalive
+ * interval from now.  Allocation failures are fatal.
+ */
+void ctdb_start_keepalive(struct ctdb_context *ctdb)
+{
+	struct tevent_timer *te;
+
+	ctdb->keepalive_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->keepalive_ctx);
+
+	te = tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
+		timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
+		ctdb_check_for_dead_nodes, ctdb);
+	CTDB_NO_MEMORY_FATAL(ctdb, te);
+
+	DEBUG(DEBUG_NOTICE,("Keepalive monitoring has been started\n"));
+
+	if (ctdb->tunable.allow_mixed_versions != 1) {
+		return;
+	}
+
+	/* Mixed versions disable the version check on received keepalives */
+	DEBUG(DEBUG_WARNING,
+		("CTDB cluster with mixed versions configured\n"));
+}
+
+/* Stop keepalive monitoring by freeing the context that owns the timer. */
+void ctdb_stop_keepalive(struct ctdb_context *ctdb)
+{
+ /* The keepalive timer is allocated on keepalive_ctx, so it goes too */
+ talloc_free(ctdb->keepalive_ctx);
+ ctdb->keepalive_ctx = NULL;
+}
+
+/*
+ * Handle a keepalive received from another node.
+ *
+ * Unless mixed versions are allowed, the sender's version is compared
+ * with ours.  This node shuts down when the keepalive carries no version
+ * at all, or when the versions differ and the sender has been up longer
+ * than us (on equal uptime, when the sender's version is higher).
+ */
+void ctdb_request_keepalive(struct ctdb_context *ctdb,
+	struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_keepalive_old *c =
+		(struct ctdb_req_keepalive_old *)hdr;
+	uint32_t my_version = keepalive_version();
+	uint32_t my_uptime = keepalive_uptime(ctdb);
+	bool must_yield;
+
+	/* Don't check anything if mixed versions are allowed */
+	if (ctdb->tunable.allow_mixed_versions == 1) {
+		return;
+	}
+
+	if (hdr->length == sizeof(struct ctdb_req_header)) {
+		/* Old keepalive: header only, no version information */
+		DEBUG(DEBUG_ERR,
+			("Keepalive version missing from node %u\n", hdr->srcnode));
+	} else {
+		must_yield = (c->version != my_version) &&
+			(c->uptime > my_uptime ||
+			 (c->uptime == my_uptime &&
+			  c->version > my_version));
+		if (!must_yield) {
+			return;
+		}
+
+		DEBUG(DEBUG_ERR,
+			("Keepalive version mismatch 0x%08x != 0x%08x from node %u\n",
+			my_version, c->version, hdr->srcnode));
+	}
+
+	DEBUG(DEBUG_ERR,
+		("CTDB Cluster with mixed versions, cannot continue\n"));
+	ctdb_shutdown_sequence(ctdb, 0);
+}
diff --git a/ctdb/server/ctdb_lock.c b/ctdb/server/ctdb_lock.c
new file mode 100644
index 0000000..063ebfa
--- /dev/null
+++ b/ctdb/server/ctdb_lock.c
@@ -0,0 +1,996 @@
+/*
+ ctdb lock handling
+ provide API to do non-blocking locks for single or all databases
+
+ Copyright (C) Amitay Isaacs 2012
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+
+#include "ctdb_private.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ * Non-blocking Locking API
+ *
+ * 1. Create a child process to do blocking locks.
+ * 2. Once the locks are obtained, signal parent process via fd.
+ * 3. Invoke registered callback routine with locking status.
+ * 4. If the child process cannot get locks within certain time,
+ * execute an external script to debug.
+ *
+ * ctdb_lock_record() - get a lock on a record
+ * ctdb_lock_db() - get a lock on a DB
+ *
+ * auto_mark - whether to mark/unmark DBs in before/after callback
+ * = false is used for freezing databases for
+ * recovery since the recovery cannot start till
+ * databases are locked on all the nodes.
+ * = true is used for record locks.
+ */
+
+/*
+ * Kind of lock a request asks for.  The values are also used as the index
+ * into lock_type_str[].
+ */
+enum lock_type {
+ LOCK_RECORD,
+ LOCK_DB,
+};
+
+/*
+ * Names handed to the latency statistics macros in ctdb_lock_handler(),
+ * indexed by enum lock_type.
+ */
+static const char * const lock_type_str[] = {
+ "lock_record",
+ "lock_db",
+};
+
+struct lock_request;
+
+/* lock_context is the common part for a lock request */
+struct lock_context {
+ /* List links: per-database lists for record locks, per-daemon lists for DB locks */
+ struct lock_context *next, *prev;
+ enum lock_type type;
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db;
+ /* Private copy of the key; dptr is NULL when the caller's key was empty */
+ TDB_DATA key;
+ /* Stored from the caller; both public entry points in this file pass 0 */
+ uint32_t priority;
+ /* When true, process_callbacks() marks/unmarks the lock around the
+  * callback and frees this context afterwards */
+ bool auto_mark;
+ /* Client request; NULL once the request was freed or its callback started */
+ struct lock_request *request;
+ /* PID of the lock helper, -1 while none is running */
+ pid_t child;
+ /* Pipe to the helper: fd[0] is read by the daemon, fd[1] is passed to the helper */
+ int fd[2];
+ /* Read event on fd[0], handled by ctdb_lock_handler() */
+ struct tevent_fd *tfd;
+ /* Timeout timer, handled by ctdb_lock_timeout_handler() */
+ struct tevent_timer *ttimer;
+ /* Set when the request is created; basis for latency and timeout reporting */
+ struct timeval start_time;
+ /* ctdb_hash() of the key; only assigned for non-empty keys */
+ uint32_t key_hash;
+ /* Not referenced by any function in ctdb_lock.c */
+ bool can_schedule;
+};
+
+/* lock_request is the client specific part for a lock request */
+struct lock_request {
+ /* Associated lock context; NULL once the context is gone or detached */
+ struct lock_context *lctx;
+ /* Invoked with private_data and true/false for lock obtained/failed */
+ void (*callback)(void *, bool);
+ void *private_data;
+};
+
+
+/*
+ * Call handler(db, private_data) for every attached database, in list
+ * order.  Stops at the first non-zero handler result and returns -1;
+ * returns 0 when every handler call returned 0.
+ */
+int ctdb_db_iterator(struct ctdb_context *ctdb, ctdb_db_handler_t handler,
+		     void *private_data)
+{
+	struct ctdb_db_context *db = ctdb->db_list;
+
+	while (db != NULL) {
+		if (handler(db, private_data) != 0) {
+			return -1;
+		}
+		db = db->next;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark one database as locked: first the transaction write lock, then the
+ * all-record lock.  Returns 0 on success, -1 as soon as either step fails.
+ * private_data is unused.
+ */
+static int db_lock_mark_handler(struct ctdb_db_context *ctdb_db,
+				void *private_data)
+{
+	int tdb_transaction_write_lock_mark(struct tdb_context *);
+	int rc;
+
+	DEBUG(DEBUG_INFO, ("marking locked database %s\n", ctdb_db->db_name));
+
+	rc = tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to mark (transaction lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	rc = tdb_lockall_mark(ctdb_db->ltdb->tdb);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to mark (all lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark a frozen database as locked.  Refuses (returns -1) when the
+ * database is not frozen; otherwise returns the mark handler's result.
+ */
+int ctdb_lockdb_mark(struct ctdb_db_context *ctdb_db)
+{
+	if (ctdb_db_frozen(ctdb_db)) {
+		return db_lock_mark_handler(ctdb_db, NULL);
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Attempt to mark database locked when not frozen\n"));
+	return -1;
+}
+
+/*
+ * Undo db_lock_mark_handler() for one database: unmark the transaction
+ * write lock, then the all-record lock.  Returns 0 on success, -1 as soon
+ * as either step fails.  private_data is unused.
+ */
+static int db_lock_unmark_handler(struct ctdb_db_context *ctdb_db,
+				  void *private_data)
+{
+	int tdb_transaction_write_lock_unmark(struct tdb_context *);
+	int rc;
+
+	DEBUG(DEBUG_INFO, ("unmarking locked database %s\n", ctdb_db->db_name));
+
+	rc = tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to unmark (transaction lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	rc = tdb_lockall_unmark(ctdb_db->ltdb->tdb);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR, ("Failed to unmark (all lock) database %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Unmark a frozen database.  Refuses (returns -1) when the database is
+ * not frozen; otherwise returns the unmark handler's result.
+ */
+int ctdb_lockdb_unmark(struct ctdb_db_context *ctdb_db)
+{
+	if (ctdb_db_frozen(ctdb_db)) {
+		return db_lock_unmark_handler(ctdb_db, NULL);
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Attempt to unmark database locked when not frozen\n"));
+	return -1;
+}
+
+static void ctdb_lock_schedule(struct ctdb_context *ctdb);
+
+/*
+ * Destructor to kill the child locking process
+ *
+ * Runs when a lock context is freed.  Detaches the context from its
+ * request, takes it off whichever list it is on (current if a helper is
+ * running, pending otherwise), fixes up the counters and then tries to
+ * schedule the next pending lock.  Always returns 0 so the free proceeds.
+ */
+static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
+{
+ /* Clear the back pointer so the request destructor will not free us again */
+ if (lock_ctx->request) {
+ lock_ctx->request->lctx = NULL;
+ }
+ /* child > 0 means a helper was started, i.e. the context is "current" */
+ if (lock_ctx->child > 0) {
+ ctdb_kill(lock_ctx->ctdb, lock_ctx->child, SIGTERM);
+ if (lock_ctx->type == LOCK_RECORD) {
+ DLIST_REMOVE(lock_ctx->ctdb_db->lock_current, lock_ctx);
+ } else {
+ DLIST_REMOVE(lock_ctx->ctdb->lock_current, lock_ctx);
+ }
+ /* Guard against underflow of the per-database helper count */
+ if (lock_ctx->ctdb_db->lock_num_current == 0) {
+ ctdb_fatal(NULL, "Lock count is 0 before decrement\n");
+ }
+ lock_ctx->ctdb_db->lock_num_current--;
+ CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+ CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
+ } else {
+ /* No helper running: the context is still waiting on a pending list */
+ if (lock_ctx->type == LOCK_RECORD) {
+ DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx);
+ } else {
+ DLIST_REMOVE(lock_ctx->ctdb->lock_pending, lock_ctx);
+ }
+ CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+ CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+ }
+
+ /* A slot may have been released, so try to start another helper */
+ ctdb_lock_schedule(lock_ctx->ctdb);
+
+ return 0;
+}
+
+
+/*
+ * Destructor for a lock request.
+ *
+ * If the request still owns a lock context, break the context's back
+ * pointer first (so the context destructor leaves this request alone) and
+ * then free the context.  Always returns 0.
+ */
+static int ctdb_lock_request_destructor(struct lock_request *lock_request)
+{
+	if (lock_request->lctx != NULL) {
+		lock_request->lctx->request = NULL;
+		TALLOC_FREE(lock_request->lctx);
+	}
+
+	return 0;
+}
+
+/*
+ * Process all the callbacks waiting for lock
+ *
+ * If lock has failed, callback is executed with locked=false
+ *
+ * With auto_mark the lock is marked as held in this process before the
+ * callback and unmarked after it, and the lock context is freed here.
+ * Without auto_mark the context is left alive and stays linked from the
+ * request, so it is released when the request is freed.
+ */
+static void process_callbacks(struct lock_context *lock_ctx, bool locked)
+{
+ struct lock_request *request;
+ /* Cached up front: used again after the callback has run */
+ bool auto_mark = lock_ctx->auto_mark;
+
+ if (auto_mark && locked) {
+ switch (lock_ctx->type) {
+ case LOCK_RECORD:
+ tdb_chainlock_mark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
+ break;
+
+ case LOCK_DB:
+ /* Result deliberately ignored */
+ (void)ctdb_lockdb_mark(lock_ctx->ctdb_db);
+ break;
+ }
+ }
+
+ request = lock_ctx->request;
+ if (auto_mark) {
+ /* Since request may be freed in the callback, unset the lock
+ * context, so request destructor will not free lock context.
+ */
+ request->lctx = NULL;
+ }
+
+ /* Since request may be freed in the callback, unset the request */
+ lock_ctx->request = NULL;
+
+ request->callback(request->private_data, locked);
+
+ /* Caller keeps the lock: nothing to unmark, context stays alive */
+ if (!auto_mark) {
+ return;
+ }
+
+ if (locked) {
+ switch (lock_ctx->type) {
+ case LOCK_RECORD:
+ tdb_chainlock_unmark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
+ break;
+
+ case LOCK_DB:
+ ctdb_lockdb_unmark(lock_ctx->ctdb_db);
+ break;
+ }
+ }
+
+ /* Runs ctdb_lock_context_destructor(), which terminates the helper */
+ talloc_free(lock_ctx);
+}
+
+
+/*
+ * Map a lock latency in seconds to a histogram bucket index 0..10.
+ *
+ * Bucket i holds values strictly below the i-th upper bound
+ * (1ms, 10ms, 100ms, 1s, 2s, 4s, 8s, 16s, 32s, 64s); anything at or above
+ * 64s lands in bucket 10.
+ */
+static int lock_bucket_id(double t)
+{
+	const double ms = 1.e-3, s = 1;
+	const double upper[] = {
+		1*ms, 10*ms, 100*ms,
+		1*s, 2*s, 4*s, 8*s, 16*s, 32*s, 64*s,
+	};
+	const int nbounds = (int)(sizeof(upper) / sizeof(upper[0]));
+	int id;
+
+	for (id = 0; id < nbounds; id++) {
+		if (t < upper[id]) {
+			break;
+		}
+	}
+
+	return id;
+}
+
+/*
+ * Callback routine when the required locks are obtained.
+ * Called from parent context
+ *
+ * Fires when the helper's pipe becomes readable.  A single status byte is
+ * read: 0 means the lock was obtained; any other value, or a failed or
+ * short read, counts as failure.  Statistics are updated and the waiting
+ * callback is run via process_callbacks().
+ */
+static void ctdb_lock_handler(struct tevent_context *ev,
+ struct tevent_fd *tfd,
+ uint16_t flags,
+ void *private_data)
+{
+ struct lock_context *lock_ctx;
+ char c;
+ bool locked;
+ double t;
+ int id;
+
+ lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
+
+ /* cancel the timeout event */
+ TALLOC_FREE(lock_ctx->ttimer);
+
+ /* Latency is measured from creation of the lock context */
+ t = timeval_elapsed(&lock_ctx->start_time);
+ id = lock_bucket_id(t);
+
+ /* Read the status from the child process */
+ if (sys_read(lock_ctx->fd[0], &c, 1) != 1) {
+ locked = false;
+ } else {
+ locked = (c == 0 ? true : false);
+ }
+
+ /* Update statistics */
+ CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_calls);
+ CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_calls);
+
+ if (locked) {
+ /* Latency and histogram are only recorded for successful locks */
+ CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.buckets[id]);
+ CTDB_UPDATE_LATENCY(lock_ctx->ctdb, lock_ctx->ctdb_db,
+ lock_type_str[lock_ctx->type], locks.latency,
+ lock_ctx->start_time);
+
+ CTDB_UPDATE_DB_LATENCY(lock_ctx->ctdb_db, lock_type_str[lock_ctx->type], locks.latency, t);
+ CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.buckets[id]);
+ } else {
+ CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_failed);
+ CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_failed);
+ }
+
+ /* May free lock_ctx (auto_mark case); do not touch it afterwards */
+ process_callbacks(lock_ctx, locked);
+}
+
+/*
+ * Per-key record of the last "unable to get RECORD lock" message, used by
+ * lock_log_skip() to avoid repeating it.  A pointer to the entry is stored
+ * in the lock_log hash under the record key.
+ */
+struct lock_log_entry {
+ /* Hash the entry is stored in; needed by the cleanup timer */
+ struct db_hash_context *lock_log;
+ /* Private copy of the record key */
+ TDB_DATA key;
+ /* Elapsed seconds reported by the most recent logged message */
+ unsigned long log_sec;
+ /* 30 second timer that removes the entry via lock_log_cleanup() */
+ struct tevent_timer *timer;
+};
+
+/*
+ * db_hash_fetch() parser: the stored value is a raw pointer to a
+ * struct lock_log_entry.  Writes that pointer to *private_data and returns
+ * 0, or returns EINVAL when the value is not exactly pointer-sized.
+ */
+static int lock_log_fetch_parser(uint8_t *keybuf, size_t keylen,
+				 uint8_t *databuf, size_t datalen,
+				 void *private_data)
+{
+	struct lock_log_entry **result = private_data;
+	void *stored;
+
+	if (datalen != sizeof(struct lock_log_entry *)) {
+		return EINVAL;
+	}
+
+	stored = *(void **)databuf;
+	*result = talloc_get_type_abort(stored, struct lock_log_entry);
+
+	return 0;
+}
+
+/*
+ * Timer callback that expires a lock log entry: remove it from the hash
+ * and, only if that removal succeeded, free the entry.
+ */
+static void lock_log_cleanup(struct tevent_context *ev,
+			     struct tevent_timer *ttimer,
+			     struct timeval current_time,
+			     void *private_data)
+{
+	struct lock_log_entry *entry = talloc_get_type_abort(
+		private_data, struct lock_log_entry);
+
+	entry->timer = NULL;
+
+	if (db_hash_delete(entry->lock_log, entry->key.dptr,
+			   entry->key.dsize) == 0) {
+		talloc_free(entry);
+	}
+}
+
+/*
+ * Decide whether the "unable to get RECORD lock" message for this key
+ * should be suppressed.
+ *
+ * Returns true only when an entry for the key already exists and
+ * elapsed_sec is not larger than the value logged last time.  In every
+ * other case (new key, longer wait than before, any internal failure) it
+ * returns false, i.e. the caller logs.
+ */
+static bool lock_log_skip(struct tevent_context *ev,
+ struct db_hash_context *lock_log,
+ TDB_DATA key, unsigned long elapsed_sec)
+{
+ struct lock_log_entry *entry = NULL;
+ int ret;
+
+ ret = db_hash_fetch(lock_log, key.dptr, key.dsize,
+ lock_log_fetch_parser, &entry);
+ if (ret == ENOENT) {
+
+ /* First message for this key: remember it for 30 seconds */
+ entry = talloc_zero(lock_log, struct lock_log_entry);
+ if (entry == NULL) {
+ goto fail;
+ }
+
+ entry->lock_log = lock_log;
+
+ entry->key.dptr = talloc_memdup(entry, key.dptr, key.dsize);
+ if (entry->key.dptr == NULL) {
+ talloc_free(entry);
+ goto fail;
+ }
+ entry->key.dsize = key.dsize;
+
+ entry->log_sec = elapsed_sec;
+ entry->timer = tevent_add_timer(ev, entry,
+ timeval_current_ofs(30, 0),
+ lock_log_cleanup, entry);
+ if (entry->timer == NULL) {
+ talloc_free(entry);
+ goto fail;
+ }
+
+ /* The hash stores the pointer value itself, not the struct */
+ ret = db_hash_add(lock_log, key.dptr, key.dsize,
+ (uint8_t *)&entry,
+ sizeof(struct lock_log_entry *));
+ if (ret != 0) {
+ talloc_free(entry);
+ goto fail;
+ }
+
+ return false;
+
+ } else if (ret == EINVAL) {
+
+ /* Stored value had the wrong size (parser rejected it): drop it */
+ ret = db_hash_delete(lock_log, key.dptr, key.dsize);
+ if (ret != 0) {
+ goto fail;
+ }
+
+ return false;
+
+ } else if (ret == 0) {
+
+ /* Already logged at least this elapsed time: suppress */
+ if (elapsed_sec <= entry->log_sec) {
+ return true;
+ }
+
+ entry->log_sec = elapsed_sec;
+
+ /* Restart the 30 second expiry from now */
+ TALLOC_FREE(entry->timer);
+ entry->timer = tevent_add_timer(ev, entry,
+ timeval_current_ofs(30, 0),
+ lock_log_cleanup, entry);
+ if (entry->timer == NULL) {
+ /* Without a timer the entry would never expire, so remove it */
+ ret = db_hash_delete(lock_log, key.dptr, key.dsize);
+ if (ret != 0) {
+ goto fail;
+ }
+ talloc_free(entry);
+ }
+
+ return false;
+ }
+
+
+ /* Any other fetch result, and every failure above, means "do not skip" */
+fail:
+ return false;
+
+}
+
+/*
+ * Build the NULL-terminated argument vector for the lock debugging script:
+ *   "debug_locks", <lock helper pid>, RECORD|DB, <tdb path>, MUTEX|FCNTL
+ *
+ * The array and all strings are allocated under mem_ctx.  Returns NULL if
+ * any allocation fails.
+ */
+static const char **debug_locks_args(TALLOC_CTX *mem_ctx, struct lock_context *lock_ctx)
+{
+	/* Program, lock helper PID, db|record, tdb path, fcntl|mutex, NULL */
+	const int nargs = 6;
+	const char **argv;
+	const char *scope;
+	const char *mechanism;
+	int idx;
+
+	argv = talloc_array(mem_ctx, const char *, nargs);
+	if (argv == NULL) {
+		return NULL;
+	}
+
+	scope = (lock_ctx->type == LOCK_RECORD) ? "RECORD" : "DB";
+
+	argv[0] = talloc_strdup(argv, "debug_locks");
+	argv[1] = talloc_asprintf(argv, "%d", lock_ctx->child);
+	argv[2] = talloc_strdup(argv, scope);
+	argv[3] = talloc_strdup(argv, lock_ctx->ctdb_db->db_path);
+
+	if (tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb) & TDB_MUTEX_LOCKING) {
+		mechanism = "MUTEX";
+	} else {
+		mechanism = "FCNTL";
+	}
+	argv[4] = talloc_strdup(argv, mechanism);
+
+	argv[5] = NULL;
+
+	/* A NULL in any real slot means an allocation above failed */
+	for (idx = 0; idx < nargs - 1; idx++) {
+		if (argv[idx] == NULL) {
+			talloc_free(argv);
+			return NULL;
+		}
+	}
+
+	return argv;
+}
+
+/*
+ * Callback routine when required locks are not obtained within timeout
+ * Called from parent context
+ *
+ * Logs how long the lock has been outstanding (record locks are rate
+ * limited per key through lock_log_skip()), optionally launches the lock
+ * debugging script, and re-arms itself for another 10 seconds.
+ *
+ * The debugging script is only tracked as a child when vfork() actually
+ * produced one; a failed vfork() is logged instead of handing -1 to
+ * ctdb_track_child().
+ */
+static void ctdb_lock_timeout_handler(struct tevent_context *ev,
+				    struct tevent_timer *ttimer,
+				    struct timeval current_time,
+				    void *private_data)
+{
+	static char debug_locks[PATH_MAX+1] = "";
+	struct lock_context *lock_ctx;
+	struct ctdb_context *ctdb;
+	pid_t pid;
+	double elapsed_time;
+	bool skip;
+	char *keystr;
+	const char **args;
+
+	lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
+	ctdb = lock_ctx->ctdb;
+
+	elapsed_time = timeval_elapsed(&lock_ctx->start_time);
+
+	/* For database locks, always log */
+	if (lock_ctx->type == LOCK_DB) {
+		DEBUG(DEBUG_WARNING,
+		      ("Unable to get DB lock on database %s for "
+		       "%.0lf seconds\n",
+		       lock_ctx->ctdb_db->db_name, elapsed_time));
+		goto lock_debug;
+	}
+
+	/* For record locks, check if we have already logged */
+	skip = lock_log_skip(ev, lock_ctx->ctdb_db->lock_log,
+			     lock_ctx->key, (unsigned long)elapsed_time);
+	if (skip) {
+		goto skip_lock_debug;
+	}
+
+	keystr = hex_encode_talloc(lock_ctx, lock_ctx->key.dptr,
+				   lock_ctx->key.dsize);
+	DEBUG(DEBUG_WARNING,
+	      ("Unable to get RECORD lock on database %s for %.0lf seconds"
+	       " (key %s)\n",
+	       lock_ctx->ctdb_db->db_name, elapsed_time,
+	       keystr ? keystr : ""));
+	TALLOC_FREE(keystr);
+
+	/* If a node stopped/banned, don't spam the logs */
+	if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
+		goto skip_lock_debug;
+	}
+
+lock_debug:
+
+	if (ctdb_set_helper("lock debugging helper",
+			    debug_locks, sizeof(debug_locks),
+			    "CTDB_DEBUG_LOCKS",
+			    getenv("CTDB_BASE"), "debug_locks.sh")) {
+		args = debug_locks_args(lock_ctx, lock_ctx);
+		if (args != NULL) {
+			pid = vfork();
+			if (pid == 0) {
+				/* Child: only exec or _exit after vfork() */
+				execvp(debug_locks, discard_const(args));
+				_exit(0);
+			}
+			talloc_free(args);
+			if (pid == -1) {
+				/* No child exists, so there is nothing to track */
+				DEBUG(DEBUG_WARNING,
+				      (__location__
+				       " Unable to fork lock debugging helper\n"));
+			} else {
+				ctdb_track_child(ctdb, pid);
+			}
+		} else {
+			D_WARNING("No memory for debug locks args\n");
+		}
+	} else {
+		DEBUG(DEBUG_WARNING,
+		      (__location__
+		       " Unable to setup lock debugging\n"));
+	}
+
+skip_lock_debug:
+
+	/* reset the timeout timer */
+	lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
+					    lock_ctx,
+					    timeval_current_ofs(10, 0),
+					    ctdb_lock_timeout_handler,
+					    (void *)lock_ctx);
+}
+
+/*
+ * Build the argument vector for the lock helper process.
+ *
+ *   RECORD: <pid> <fd> "RECORD" <db path> <tdb flags> <hex key | "NULL">
+ *   DB:     <pid> <fd> "DB"     <db path> <tdb flags>
+ *
+ * The program name is not part of this array (presumably added by
+ * ctdb_vfork_exec() - confirm there).  On success returns true and stores
+ * the array, allocated under mem_ctx, in *argv and its size including the
+ * terminating NULL slot in *argc.  Returns false on allocation failure.
+ */
+static bool lock_helper_args(TALLOC_CTX *mem_ctx,
+ struct lock_context *lock_ctx, int fd,
+ int *argc, const char ***argv)
+{
+ const char **args = NULL;
+ int nargs = 0, i;
+
+ switch (lock_ctx->type) {
+ case LOCK_RECORD:
+ nargs = 6;
+ break;
+
+ case LOCK_DB:
+ nargs = 5;
+ break;
+ }
+
+ /* Add extra argument for null termination */
+ nargs++;
+
+ args = talloc_array(mem_ctx, const char *, nargs);
+ if (args == NULL) {
+ return false;
+ }
+
+ /* The daemon's own PID and the write end of the status pipe */
+ args[0] = talloc_asprintf(args, "%d", getpid());
+ args[1] = talloc_asprintf(args, "%d", fd);
+
+ switch (lock_ctx->type) {
+ case LOCK_RECORD:
+ args[2] = talloc_strdup(args, "RECORD");
+ args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
+ args[4] = talloc_asprintf(args, "0x%x",
+ tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb));
+ /* An empty key is passed as the literal string "NULL" */
+ if (lock_ctx->key.dsize == 0) {
+ args[5] = talloc_strdup(args, "NULL");
+ } else {
+ args[5] = hex_encode_talloc(args, lock_ctx->key.dptr, lock_ctx->key.dsize);
+ }
+ break;
+
+ case LOCK_DB:
+ args[2] = talloc_strdup(args, "DB");
+ args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
+ args[4] = talloc_asprintf(args, "0x%x",
+ tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb));
+ break;
+ }
+
+ /* Make sure last argument is NULL */
+ args[nargs-1] = NULL;
+
+ /* A NULL in any real slot means one of the allocations above failed */
+ for (i=0; i<nargs-1; i++) {
+ if (args[i] == NULL) {
+ talloc_free(args);
+ return false;
+ }
+ }
+
+ *argc = nargs;
+ *argv = args;
+ return true;
+}
+
+/*
+ * Find a lock request that can be scheduled
+ *
+ * Database lock requests (ctdb->lock_pending) are considered first, then
+ * the per-database record lock queues.  Contexts that no longer have a
+ * request attached are discarded along the way.  Returns NULL when
+ * nothing can be scheduled.
+ *
+ * NOTE(review): talloc_free() on a discarded context also runs
+ * ctdb_lock_context_destructor(), which for a context without a running
+ * helper repeats the list removal and the num_pending decrements done
+ * here, and calls ctdb_lock_schedule() again - confirm this is intended.
+ */
+static struct lock_context *ctdb_find_lock_context(struct ctdb_context *ctdb)
+{
+ struct lock_context *lock_ctx, *next_ctx;
+ struct ctdb_db_context *ctdb_db;
+
+ /* First check if there are database lock requests */
+
+ for (lock_ctx = ctdb->lock_pending; lock_ctx != NULL;
+ lock_ctx = next_ctx) {
+
+ if (lock_ctx->request != NULL) {
+ /* Found a lock context with a request */
+ return lock_ctx;
+ }
+
+ /* Remember the successor before this element is freed */
+ next_ctx = lock_ctx->next;
+
+ DEBUG(DEBUG_INFO, ("Removing lock context without lock "
+ "request\n"));
+ DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
+ CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
+ CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+ talloc_free(lock_ctx);
+ }
+
+ /* Next check database queues */
+ for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
+ /* Skip databases whose helper count equals the per-database limit */
+ if (ctdb_db->lock_num_current ==
+ ctdb->tunable.lock_processes_per_db) {
+ continue;
+ }
+
+ for (lock_ctx = ctdb_db->lock_pending; lock_ctx != NULL;
+ lock_ctx = next_ctx) {
+
+ next_ctx = lock_ctx->next;
+
+ if (lock_ctx->request != NULL) {
+ return lock_ctx;
+ }
+
+ DEBUG(DEBUG_INFO, ("Removing lock context without "
+ "lock request\n"));
+ DLIST_REMOVE(ctdb_db->lock_pending, lock_ctx);
+ CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
+ CTDB_DECREMENT_DB_STAT(ctdb_db, locks.num_pending);
+ talloc_free(lock_ctx);
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Schedule a new lock child process
+ * Set up callback handler and timeout handler
+ *
+ * Picks one schedulable pending lock context, starts a lock helper for it
+ * connected through a pipe, arms a 10 second timeout and a read event on
+ * the pipe, and moves the context from its pending list to the matching
+ * current list.  On any failure the context is left on the pending list
+ * with child == -1 and both pipe ends closed.
+ */
+static void ctdb_lock_schedule(struct ctdb_context *ctdb)
+{
+ struct lock_context *lock_ctx;
+ int ret, argc;
+ TALLOC_CTX *tmp_ctx;
+ /* Helper path buffer; static so it persists across calls */
+ static char prog[PATH_MAX+1] = "";
+ const char **args;
+
+ if (!ctdb_set_helper("lock helper",
+ prog, sizeof(prog),
+ "CTDB_LOCK_HELPER",
+ CTDB_HELPER_BINDIR, "ctdb_lock_helper")) {
+ ctdb_die(ctdb, __location__
+ " Unable to set lock helper\n");
+ }
+
+ /* Find a lock context with requests */
+ lock_ctx = ctdb_find_lock_context(ctdb);
+ if (lock_ctx == NULL) {
+ return;
+ }
+
+ lock_ctx->child = -1;
+ ret = pipe(lock_ctx->fd);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to create pipe in ctdb_lock_schedule\n"));
+ return;
+ }
+
+ /* Only the read end is close-on-exec; the helper needs the write end */
+ set_close_on_exec(lock_ctx->fd[0]);
+
+ /* Create data for child process */
+ tmp_ctx = talloc_new(lock_ctx);
+ if (tmp_ctx == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate memory for helper args\n"));
+ close(lock_ctx->fd[0]);
+ close(lock_ctx->fd[1]);
+ return;
+ }
+
+ /* The helper checks CTDB_NOSETSCHED to skip real-time scheduling */
+ if (! ctdb->do_setsched) {
+ ret = setenv("CTDB_NOSETSCHED", "1", 1);
+ if (ret != 0) {
+ DEBUG(DEBUG_WARNING,
+ ("Failed to set CTDB_NOSETSCHED variable\n"));
+ }
+ }
+
+ /* Create arguments for lock helper */
+ if (!lock_helper_args(tmp_ctx, lock_ctx, lock_ctx->fd[1],
+ &argc, &args)) {
+ DEBUG(DEBUG_ERR, ("Failed to create lock helper args\n"));
+ close(lock_ctx->fd[0]);
+ close(lock_ctx->fd[1]);
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ lock_ctx->child = ctdb_vfork_exec(lock_ctx, ctdb, prog, argc,
+ (const char **)args);
+ if (lock_ctx->child == -1) {
+ DEBUG(DEBUG_ERR, ("Failed to create a child in ctdb_lock_schedule\n"));
+ close(lock_ctx->fd[0]);
+ close(lock_ctx->fd[1]);
+ talloc_free(tmp_ctx);
+ return;
+ }
+
+ /* Parent process */
+ close(lock_ctx->fd[1]);
+
+ talloc_free(tmp_ctx);
+
+ /* Set up timeout handler */
+ lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
+ lock_ctx,
+ timeval_current_ofs(10, 0),
+ ctdb_lock_timeout_handler,
+ (void *)lock_ctx);
+ if (lock_ctx->ttimer == NULL) {
+ /* Undo the helper start; child = -1 keeps the context "pending" */
+ ctdb_kill(ctdb, lock_ctx->child, SIGTERM);
+ lock_ctx->child = -1;
+ close(lock_ctx->fd[0]);
+ return;
+ }
+
+ /* Set up callback */
+ lock_ctx->tfd = tevent_add_fd(ctdb->ev,
+ lock_ctx,
+ lock_ctx->fd[0],
+ TEVENT_FD_READ,
+ ctdb_lock_handler,
+ (void *)lock_ctx);
+ if (lock_ctx->tfd == NULL) {
+ TALLOC_FREE(lock_ctx->ttimer);
+ ctdb_kill(ctdb, lock_ctx->child, SIGTERM);
+ lock_ctx->child = -1;
+ close(lock_ctx->fd[0]);
+ return;
+ }
+ /* From here fd[0] is closed together with the fd event */
+ tevent_fd_set_auto_close(lock_ctx->tfd);
+
+ /* Move the context from pending to current */
+ if (lock_ctx->type == LOCK_RECORD) {
+ DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx);
+ DLIST_ADD_END(lock_ctx->ctdb_db->lock_current, lock_ctx);
+ } else {
+ DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
+ DLIST_ADD_END(ctdb->lock_current, lock_ctx);
+ }
+ CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+ CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+ lock_ctx->ctdb_db->lock_num_current++;
+ CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+ CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
+}
+
+
+/*
+ * Lock record / db depending on type
+ *
+ * Creates a lock context (owned by ctdb) and a lock request (owned by
+ * mem_ctx), queues the context as pending and tries to schedule it.
+ * Returns the request, or NULL if no callback was given or an allocation
+ * failed.  Freeing the returned request also frees the lock context.
+ */
+static struct lock_request *ctdb_lock_internal(TALLOC_CTX *mem_ctx,
+ struct ctdb_context *ctdb,
+ struct ctdb_db_context *ctdb_db,
+ TDB_DATA key,
+ uint32_t priority,
+ void (*callback)(void *, bool),
+ void *private_data,
+ enum lock_type type,
+ bool auto_mark)
+{
+ struct lock_context *lock_ctx = NULL;
+ struct lock_request *request;
+
+ if (callback == NULL) {
+ DEBUG(DEBUG_WARNING, ("No callback function specified, not locking\n"));
+ return NULL;
+ }
+
+ lock_ctx = talloc_zero(ctdb, struct lock_context);
+ if (lock_ctx == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
+ return NULL;
+ }
+
+ if ((request = talloc_zero(mem_ctx, struct lock_request)) == NULL) {
+ talloc_free(lock_ctx);
+ return NULL;
+ }
+
+ lock_ctx->type = type;
+ lock_ctx->ctdb = ctdb;
+ lock_ctx->ctdb_db = ctdb_db;
+ lock_ctx->key.dsize = key.dsize;
+ /* Keep a private copy of the key; the caller's buffer is not retained */
+ if (key.dsize > 0) {
+ lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
+ if (lock_ctx->key.dptr == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ "Memory allocation error\n"));
+ talloc_free(lock_ctx);
+ talloc_free(request);
+ return NULL;
+ }
+ lock_ctx->key_hash = ctdb_hash(&key);
+ } else {
+ lock_ctx->key.dptr = NULL;
+ }
+ lock_ctx->priority = priority;
+ lock_ctx->auto_mark = auto_mark;
+
+ lock_ctx->request = request;
+ lock_ctx->child = -1;
+
+ /* Record locks are queued per database.  Non-record (DB) locks go on
+ * the daemon-wide queue, which ctdb_find_lock_context() examines before
+ * any per-database queue.  Both are appended at the tail.
+ */
+ if (lock_ctx->type == LOCK_RECORD) {
+ DLIST_ADD_END(ctdb_db->lock_pending, lock_ctx);
+ } else {
+ DLIST_ADD_END(ctdb->lock_pending, lock_ctx);
+ }
+ CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
+ if (ctdb_db) {
+ CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
+ }
+
+ /* Latency and timeout messages are measured from this point, i.e.
+ * from creation of the request, including time spent pending */
+ lock_ctx->start_time = timeval_current();
+
+ request->lctx = lock_ctx;
+ request->callback = callback;
+ request->private_data = private_data;
+
+ /* Destructors are installed only now, so the early error paths above
+ * free plain, unlinked objects */
+ talloc_set_destructor(request, ctdb_lock_request_destructor);
+ talloc_set_destructor(lock_ctx, ctdb_lock_context_destructor);
+
+ ctdb_lock_schedule(ctdb);
+
+ return request;
+}
+
+
+/*
+ * Request a non-blocking lock on a single record of a database.
+ *
+ * callback(private_data, locked) is invoked once the helper reports the
+ * outcome.  Returns the request handle, owned by mem_ctx, or NULL on
+ * failure.
+ */
+struct lock_request *ctdb_lock_record(TALLOC_CTX *mem_ctx,
+				      struct ctdb_db_context *ctdb_db,
+				      TDB_DATA key,
+				      bool auto_mark,
+				      void (*callback)(void *, bool),
+				      void *private_data)
+{
+	const uint32_t priority = 0;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+	return ctdb_lock_internal(mem_ctx, ctdb, ctdb_db, key, priority,
+				  callback, private_data,
+				  LOCK_RECORD, auto_mark);
+}
+
+
+/*
+ * Request a non-blocking lock on a whole database.
+ *
+ * callback(private_data, locked) is invoked once the helper reports the
+ * outcome.  Returns the request handle, owned by mem_ctx, or NULL on
+ * failure.
+ */
+struct lock_request *ctdb_lock_db(TALLOC_CTX *mem_ctx,
+				  struct ctdb_db_context *ctdb_db,
+				  bool auto_mark,
+				  void (*callback)(void *, bool),
+				  void *private_data)
+{
+	const uint32_t priority = 0;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+	return ctdb_lock_internal(mem_ctx, ctdb, ctdb_db, tdb_null, priority,
+				  callback, private_data,
+				  LOCK_DB, auto_mark);
+}
diff --git a/ctdb/server/ctdb_lock_helper.c b/ctdb/server/ctdb_lock_helper.c
new file mode 100644
index 0000000..51d2992
--- /dev/null
+++ b/ctdb/server/ctdb_lock_helper.c
@@ -0,0 +1,350 @@
+/*
+ ctdb lock helper
+
+ Copyright (C) Amitay Isaacs 2013
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/util/sys_rw.h"
+#include "lib/util/tevent_unix.h"
+
+#include "protocol/protocol.h"
+
+#include "common/system.h"
+
+static bool realtime = true;
+
+/*
+ * What the helper currently has open/locked, so that cleanup() can undo it.
+ */
+struct lock_state {
+ /* Database handle; NULL until tdb_open() succeeded */
+ struct tdb_context *tdb;
+ /* Record key for RECORD locks; dsize stays 0 for DB locks and "NULL" keys */
+ TDB_DATA key;
+};
+
+/*
+ * Switch to the real-time scheduler while waiting for a lock, unless the
+ * CTDB_NOSETSCHED environment variable is set or an earlier attempt
+ * failed.  The outcome is remembered in 'realtime'.
+ */
+static void set_priority(void)
+{
+	if (getenv("CTDB_NOSETSCHED") != NULL) {
+		realtime = false;
+	}
+
+	if (realtime) {
+		realtime = set_scheduler();
+		if (! realtime) {
+			fprintf(stderr,
+				"locking: Unable to set real-time scheduler priority\n");
+		}
+	}
+}
+
+/* Drop back from real-time scheduling, if it is currently in effect. */
+static void reset_priority(void)
+{
+	if (! realtime) {
+		return;
+	}
+
+	reset_scheduler();
+}
+
+/*
+ * Report the locking outcome to the daemon as a single byte on fd.
+ * A result of 1 (failure) terminates the helper with exit status 1.
+ */
+static void send_result(int fd, char result)
+{
+	sys_write(fd, &result, 1);
+
+	if (result != 1) {
+		return;
+	}
+
+	exit(1);
+}
+
+
+/* Print the two accepted command-line forms (RECORD and DB) to stderr. */
+static void usage(const char *progname)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: %s <ctdbd-pid> <output-fd> RECORD <db-path> <db-flags> <db-key>\n", progname);
+ fprintf(stderr, " %s <ctdbd-pid> <output-fd> DB <db-path> <db-flags>\n", progname);
+}
+
+/*
+ * Decode a string of hexadecimal digit pairs into a newly allocated byte
+ * buffer.
+ *
+ * mem_ctx: talloc parent for the result
+ * hex_in:  NUL-terminated hex string; a trailing unpaired digit is ignored
+ * len:     receives the number of decoded bytes (0 on failure)
+ *
+ * Returns the buffer, or NULL if allocation fails or the input contains a
+ * character that is not a hexadecimal digit.
+ */
+static uint8_t *hex_decode_talloc(TALLOC_CTX *mem_ctx,
+				  const char *hex_in, size_t *len)
+{
+	size_t nbytes = strlen(hex_in) / 2;
+	uint8_t *buffer;
+	size_t i, j;
+
+	*len = 0;
+
+	buffer = talloc_array(mem_ctx, uint8_t, nbytes);
+	if (buffer == NULL) {
+		return NULL;
+	}
+
+	for (i = 0; i < nbytes; i++) {
+		unsigned int byte = 0;
+
+		/* Two digits per byte, most significant nibble first */
+		for (j = 0; j < 2; j++) {
+			char ch = hex_in[i*2 + j];
+			unsigned int nibble;
+
+			if (ch >= '0' && ch <= '9') {
+				nibble = (unsigned int)(ch - '0');
+			} else if (ch >= 'a' && ch <= 'f') {
+				nibble = (unsigned int)(ch - 'a') + 10;
+			} else if (ch >= 'A' && ch <= 'F') {
+				nibble = (unsigned int)(ch - 'A') + 10;
+			} else {
+				talloc_free(buffer);
+				return NULL;
+			}
+
+			byte = (byte << 4) | nibble;
+		}
+
+		buffer[i] = (uint8_t)byte;
+	}
+
+	*len = nbytes;
+	return buffer;
+}
+
+/*
+ * Open the database and take a blocking chain lock on one record.
+ *
+ * dbpath:  path of the tdb file
+ * dbflags: tdb flags as a numeric string (any base accepted by strtol)
+ * dbkey:   hex-encoded record key, or the literal "NULL" for an empty key
+ * state:   receives the open tdb handle and the decoded key
+ *
+ * Returns 0 once the lock is held, 1 on any failure (key cannot be
+ * decoded, database cannot be opened, lock cannot be obtained).
+ */
+static int lock_record(const char *dbpath, const char *dbflags,
+		       const char *dbkey, struct lock_state *state)
+{
+	int tdb_flags;
+
+	/* No error checking since CTDB always passes sane values */
+	tdb_flags = strtol(dbflags, NULL, 0);
+
+	/* Convert hex key to key */
+	if (strcmp(dbkey, "NULL") == 0) {
+		state->key.dptr = NULL;
+		state->key.dsize = 0;
+	} else {
+		state->key.dptr = hex_decode_talloc(NULL, dbkey,
+						    &state->key.dsize);
+		if (state->key.dptr == NULL) {
+			/* Never lock with a key that could not be decoded */
+			fprintf(stderr,
+				"locking: Error decoding record key\n");
+			state->key.dsize = 0;
+			return 1;
+		}
+	}
+
+	state->tdb = tdb_open(dbpath, 0, tdb_flags, O_RDWR, 0600);
+	if (state->tdb == NULL) {
+		fprintf(stderr, "locking: Error opening database %s\n", dbpath);
+		return 1;
+	}
+
+	/* Real-time priority only while blocked waiting for the lock */
+	set_priority();
+
+	if (tdb_chainlock(state->tdb, state->key) < 0) {
+		fprintf(stderr, "locking: Error getting record lock (%s)\n",
+			tdb_errorstr(state->tdb));
+		return 1;
+	}
+
+	reset_priority();
+
+	return 0;
+
+}
+
+/*
+ * Open the database and take a blocking lock on all of it.
+ *
+ * dbflags is the tdb flags value as a numeric string.  On success the
+ * open handle is left in state->tdb and 0 is returned; 1 is returned if
+ * the database cannot be opened or the lock cannot be obtained.
+ */
+static int lock_db(const char *dbpath, const char *dbflags,
+		   struct lock_state *state)
+{
+	/* No error checking since CTDB always passes sane values */
+	int flags = strtol(dbflags, NULL, 0);
+	int rc;
+
+	state->tdb = tdb_open(dbpath, 0, flags, O_RDWR, 0600);
+	if (state->tdb == NULL) {
+		fprintf(stderr, "locking: Error opening database %s\n", dbpath);
+		return 1;
+	}
+
+	set_priority();
+
+	rc = tdb_lockall(state->tdb);
+	if (rc < 0) {
+		fprintf(stderr, "locking: Error getting db lock (%s)\n",
+			tdb_errorstr(state->tdb));
+		return 1;
+	}
+
+	reset_priority();
+
+	return 0;
+}
+
+/* State of the "wait until the parent process is gone" request. */
+struct wait_for_parent_state {
+ /* Event context used to schedule the periodic checks */
+ struct tevent_context *ev;
+ /* PID being watched */
+ pid_t ppid;
+};
+
+static void wait_for_parent_check(struct tevent_req *subreq);
+
+/*
+ * Start an async request that completes once process ppid no longer
+ * exists.  Existence is polled every 5 seconds by wait_for_parent_check().
+ * Returns NULL if the request cannot be allocated.
+ */
+static struct tevent_req *wait_for_parent_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ pid_t ppid)
+{
+ struct tevent_req *req, *subreq;
+ struct wait_for_parent_state *state;
+
+ req = tevent_req_create(mem_ctx, &state, struct wait_for_parent_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->ev = ev;
+ state->ppid = ppid;
+
+ /* PID 1 is treated as "parent already gone": complete immediately
+  * (presumably because an orphan would be reparented to init - confirm) */
+ if (ppid == 1) {
+ tevent_req_done(req);
+ return tevent_req_post(req, ev);
+ }
+
+ subreq = tevent_wakeup_send(state, ev,
+ tevent_timeval_current_ofs(5,0));
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, wait_for_parent_check, req);
+
+ return req;
+}
+
+/*
+ * Wakeup callback: finish the request if the watched process no longer
+ * exists, otherwise schedule the next check in 5 seconds.
+ */
+static void wait_for_parent_check(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct wait_for_parent_state *state = tevent_req_data(
+ req, struct wait_for_parent_state);
+ bool status;
+
+ status = tevent_wakeup_recv(subreq);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ /* Ignore error */
+ fprintf(stderr, "locking: tevent_wakeup_recv() failed\n");
+ }
+
+ /* Signal 0 only probes for existence; ESRCH means the process is gone */
+ if (kill(state->ppid, 0) == -1 && errno == ESRCH) {
+ tevent_req_done(req);
+ return;
+ }
+
+ subreq = tevent_wakeup_send(state, state->ev,
+ tevent_timeval_current_ofs(5,0));
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, wait_for_parent_check, req);
+}
+
+/*
+ * Collect the result of a wait_for_parent request.  Returns true on
+ * success; on failure returns false with the error code stored via perr.
+ */
+static bool wait_for_parent_recv(struct tevent_req *req, int *perr)
+{
+	return !tevent_req_is_unix_error(req, perr);
+}
+
+/*
+ * Release the lock and close the database, if one was opened.  An empty
+ * key (dsize == 0) selects the whole-database unlock, anything else the
+ * chain unlock for that key.
+ */
+static void cleanup(struct lock_state *state)
+{
+	struct tdb_context *tdb = state->tdb;
+
+	if (tdb == NULL) {
+		return;
+	}
+
+	if (state->key.dsize != 0) {
+		tdb_chainunlock(tdb, state->key);
+	} else {
+		tdb_unlockall(tdb);
+	}
+
+	tdb_close(tdb);
+}
+
+/*
+ * Signal callback (registered for SIGTERM in main): release the lock,
+ * close the database and exit with status 0.
+ */
+static void signal_handler(struct tevent_context *ev,
+			   struct tevent_signal *se,
+			   int signum, int count, void *siginfo,
+			   void *private_data)
+{
+	cleanup((struct lock_state *)private_data);
+	exit(0);
+}
+
+/*
+ * Lock helper entry point.
+ *
+ *   <prog> <ctdbd-pid> <output-fd> RECORD <db-path> <db-flags> <db-key>
+ *   <prog> <ctdbd-pid> <output-fd> DB <db-path> <db-flags>
+ *
+ * Takes the requested lock (blocking), writes one status byte to
+ * <output-fd> (0 = locked, 1 = failed), then keeps holding the lock until
+ * it receives SIGTERM or the process <ctdbd-pid> no longer exists.
+ */
+int main(int argc, char *argv[])
+{
+ struct tevent_context *ev;
+ struct tevent_signal *se;
+ struct tevent_req *req;
+ struct lock_state state = { 0 };
+ int write_fd;
+ char result = 0;
+ int ppid;
+ const char *lock_type;
+ bool status;
+ int err;
+
+ reset_scheduler();
+
+ if (argc < 4) {
+ usage(argv[0]);
+ exit(1);
+ }
+
+ /* Numeric arguments are converted without validation */
+ ppid = atoi(argv[1]);
+ write_fd = atoi(argv[2]);
+ lock_type = argv[3];
+
+ ev = tevent_context_init(NULL);
+ if (ev == NULL) {
+ fprintf(stderr, "locking: tevent_context_init() failed\n");
+ exit(1);
+ }
+
+ /* SIGTERM is only acted on from the event loop below, i.e. after the
+  * blocking lock call has returned */
+ se = tevent_add_signal(ev, ev, SIGTERM, 0,
+ signal_handler, &state);
+ if (se == NULL) {
+ fprintf(stderr, "locking: tevent_add_signal() failed\n");
+ talloc_free(ev);
+ exit(1);
+ }
+
+ if (strcmp(lock_type, "RECORD") == 0) {
+ if (argc != 7) {
+ fprintf(stderr,
+ "locking: Invalid number of arguments (%d)\n",
+ argc);
+ usage(argv[0]);
+ exit(1);
+ }
+ result = lock_record(argv[4], argv[5], argv[6], &state);
+
+ } else if (strcmp(lock_type, "DB") == 0) {
+ if (argc != 6) {
+ fprintf(stderr,
+ "locking: Invalid number of arguments (%d)\n",
+ argc);
+ usage(argv[0]);
+ exit(1);
+ }
+ result = lock_db(argv[4], argv[5], &state);
+
+ } else {
+ fprintf(stderr, "locking: Invalid lock-type '%s'\n", lock_type);
+ usage(argv[0]);
+ exit(1);
+ }
+
+ /* Does not return when result is 1 (send_result() exits) */
+ send_result(write_fd, result);
+
+ req = wait_for_parent_send(ev, ev, ppid);
+ if (req == NULL) {
+ fprintf(stderr, "locking: wait_for_parent_send() failed\n");
+ cleanup(&state);
+ exit(1);
+ }
+
+ /* Hold the lock until the parent goes away (or SIGTERM arrives) */
+ tevent_req_poll(req, ev);
+
+ status = wait_for_parent_recv(req, &err);
+ if (! status) {
+ fprintf(stderr,
+ "locking: wait_for_parent_recv() failed (%d)\n",
+ err);
+ }
+
+ talloc_free(ev);
+ cleanup(&state);
+ return 0;
+}
diff --git a/ctdb/server/ctdb_logging.c b/ctdb/server/ctdb_logging.c
new file mode 100644
index 0000000..1da26b5
--- /dev/null
+++ b/ctdb/server/ctdb_logging.c
@@ -0,0 +1,174 @@
+/*
+ ctdb logging code
+
+ Copyright (C) Andrew Tridgell 2008
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/blocking.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/time.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+struct ctdb_log_state {
+ int fd, pfd; /* pfd: read end of the child-output pipe (set in ctdb_set_child_logging()); fd is not referenced elsewhere in this file */
+ char buf[1024]; /* bytes read from pfd that have not yet been emitted as complete lines */
+ uint16_t buf_used; /* number of valid bytes at the start of buf */
+};
+
+/* Used by ctdb_set_child_logging(); allocated in ctdb_logging_init() */
+static struct ctdb_log_state *log_state;
+
+/*
+ * Initialise logging
+ *
+ * Allocates the state later used by ctdb_set_child_logging() on
+ * mem_ctx and then calls logging_init() with the application name
+ * "ctdbd".  Returns true on success and false if either the
+ * allocation or logging_init() fails.
+ */
+bool ctdb_logging_init(TALLOC_CTX *mem_ctx, const char *logging,
+		       const char *debug_level)
+{
+	struct ctdb_log_state *state;
+
+	state = talloc_zero(mem_ctx, struct ctdb_log_state);
+	log_state = state;
+	if (state == NULL) {
+		return false;
+	}
+
+	return (logging_init(mem_ctx, logging, debug_level, "ctdbd") == 0);
+}
+
+static void write_to_log(const char *buf, unsigned int len)
+{ /* Emit buf as one log line at script_log_level.  len is used as both
+     the precision (at most len bytes of buf are read, so buf need not
+     be NUL-terminated) and the minimum field width of the %s. */
+ DEBUG(script_log_level, ("%*.*s\n", len, len, buf));
+}
+
+/*
+  called when log data comes in from a child process
+
+  Appends whatever can be read from the pipe to the state's buffer,
+  logs every complete line (without its trailing "\n" or "\r\n") and
+  keeps any incomplete tail for the next call.  On end-of-file the
+  state is freed unless it is the global log_state.
+ */
+static void ctdb_child_log_handler(struct tevent_context *ev,
+				   struct tevent_fd *fde,
+				   uint16_t flags, void *private)
+{
+	struct ctdb_log_state *log = talloc_get_type(private, struct ctdb_log_state);
+	int nread;
+
+	if ((flags & TEVENT_FD_READ) == 0) {
+		return;
+	}
+
+	nread = sys_read(log->pfd, log->buf + log->buf_used,
+			 sizeof(log->buf) - log->buf_used);
+	if (nread == 0) {
+		/* end of file */
+		if (log != log_state) {
+			talloc_free(log);
+		}
+		return;
+	}
+	if (nread > 0) {
+		log->buf_used += nread;
+	}
+
+	/* log each complete line and shift the remainder to the front */
+	while (log->buf_used > 0) {
+		char *eol = memchr(log->buf, '\n', log->buf_used);
+		int consumed;
+		int line_len;
+
+		if (eol == NULL) {
+			break;
+		}
+
+		consumed = (eol - log->buf) + 1;
+		line_len = consumed - 1;
+		/* swallow \r from child processes */
+		if (line_len > 0 && log->buf[line_len - 1] == '\r') {
+			line_len--;
+		}
+		write_to_log(log->buf, line_len);
+		memmove(log->buf, eol + 1, sizeof(log->buf) - consumed);
+		log->buf_used -= consumed;
+	}
+
+	/* a completely full buffer without a newline cannot grow any
+	   further, so it has to be dumped out as it is */
+	if (log->buf_used == sizeof(log->buf)) {
+		write_to_log(log->buf, log->buf_used);
+		log->buf_used = 0;
+	}
+}
+
+/*
+  setup for logging of child process stdout
+
+  Creates a pipe, points this process's stdout and stderr at the write
+  end and registers the read end with the event loop so that
+  ctdb_child_log_handler() logs whatever is written to it.
+
+  Returns 0 on success and -1 on failure.  On failure no descriptor is
+  leaked and, where they had already been redirected, stdout and stderr
+  are pointed back at their previous targets.
+*/
+int ctdb_set_child_logging(struct ctdb_context *ctdb)
+{
+	int p[2];
+	int old_stdout, old_stderr;
+	struct tevent_fd *fde;
+
+	/* setup a pipe to catch IO from subprocesses */
+	if (pipe(p) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to setup for child logging pipe\n"));
+		return -1;
+	}
+
+	/* We'll fail if stderr/stdout not already open; it's simpler. */
+	old_stdout = dup(STDOUT_FILENO);
+	if (old_stdout < 0) {
+		DEBUG(DEBUG_ERR, ("Failed to dup stdout for child logging\n"));
+		close(p[0]);
+		close(p[1]);
+		return -1;
+	}
+	old_stderr = dup(STDERR_FILENO);
+	if (old_stderr < 0) {
+		DEBUG(DEBUG_ERR, ("Failed to dup stderr for child logging\n"));
+		close(old_stdout);
+		close(p[0]);
+		close(p[1]);
+		return -1;
+	}
+	if (dup2(p[1], STDOUT_FILENO) < 0 || dup2(p[1], STDERR_FILENO) < 0) {
+		int saved_errno = errno;
+		dup2(old_stdout, STDOUT_FILENO);
+		dup2(old_stderr, STDERR_FILENO);
+		close(old_stdout);
+		close(old_stderr);
+		close(p[0]);
+		close(p[1]);
+		errno = saved_errno;
+
+		printf(__location__ " dup2 failed: %s\n",
+			strerror(errno));
+		return -1;
+	}
+	close(p[1]);
+
+	fde = tevent_add_fd(ctdb->ev, log_state, p[0], TEVENT_FD_READ,
+			    ctdb_child_log_handler, log_state);
+	if (fde == NULL) {
+		/*
+		 * Nobody would ever read the pipe: undo the redirection
+		 * before reporting the failure.
+		 */
+		dup2(old_stdout, STDOUT_FILENO);
+		dup2(old_stderr, STDERR_FILENO);
+		close(old_stdout);
+		close(old_stderr);
+		close(p[0]);
+		DEBUG(DEBUG_ERR,(__location__ " Failed to add child logging pipe to event loop\n"));
+		return -1;
+	}
+	tevent_fd_set_auto_close(fde);
+
+	/* the saved descriptors were only needed for error recovery */
+	close(old_stdout);
+	close(old_stderr);
+
+	log_state->pfd = p[0];
+
+	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for logging\n", p[0]));
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_ltdb_server.c b/ctdb/server/ctdb_ltdb_server.c
new file mode 100644
index 0000000..b6c7630
--- /dev/null
+++ b/ctdb/server/ctdb_ltdb_server.c
@@ -0,0 +1,1663 @@
+/*
+ ctdb ltdb code - server side
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/dir.h"
+#include "system/time.h"
+#include "system/locale.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "server/ctdb_config.h"
+
+#define PERSISTENT_HEALTH_TDB "persistent_health.tdb"
+
+/**
+ * write a record to a normal database
+ *
+ * This is the server-variant of the ctdb_ltdb_store function.
+ * It contains logic to determine whether a record should be
+ * stored or deleted. It also sends SCHEDULE_FOR_DELETION
+ * controls to the local ctdb daemon if appropriate.
+ *
+ * The VACUUM_MIGRATED and AUTOMATIC flags are cleared in *header
+ * before the record is written.
+ *
+ * Returns the result of tdb_storev() or tdb_delete(), whichever
+ * was attempted.
+ */
+static int ctdb_ltdb_store_server(struct ctdb_db_context *ctdb_db,
+ TDB_DATA key,
+ struct ctdb_ltdb_header *header,
+ TDB_DATA data)
+{
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ TDB_DATA rec[2];
+ uint32_t hsize = sizeof(struct ctdb_ltdb_header);
+ int ret;
+ bool keep = false;
+ bool schedule_for_deletion = false;
+ bool remove_from_delete_queue = false;
+ uint32_t lmaster;
+
+ if (ctdb->flags & CTDB_FLAG_TORTURE) {
+ TDB_DATA old;
+ struct ctdb_ltdb_header *h2;
+
+ old = tdb_fetch(ctdb_db->ltdb->tdb, key);
+ h2 = (struct ctdb_ltdb_header *)old.dptr;
+ if (old.dptr != NULL &&
+ old.dsize >= hsize &&
+ h2->rsn > header->rsn) {
+ DEBUG(DEBUG_ERR,
+ ("RSN regression! %"PRIu64" %"PRIu64"\n",
+ h2->rsn, header->rsn));
+ }
+ if (old.dptr) {
+ free(old.dptr);
+ }
+ }
+
+ if (ctdb->vnn_map == NULL) {
+ /*
+ * Called from a client: always store the record
+ * Also don't call ctdb_lmaster since it uses the vnn_map!
+ */
+ keep = true;
+ goto store;
+ }
+
+ lmaster = ctdb_lmaster(ctdb_db->ctdb, &key);
+
+ /*
+ * If we migrate an empty record off to another node
+ * and the record has not been migrated with data,
+ * delete the record instead of storing the empty record.
+ */
+ if (data.dsize != 0) {
+ keep = true;
+ } else if (header->flags & CTDB_REC_RO_FLAGS) {
+ keep = true;
+ } else if (header->flags & CTDB_REC_FLAG_AUTOMATIC) {
+ /*
+ * The record is not created by the client but
+ * automatically by the ctdb_ltdb_fetch logic that
+ * creates a record with an initial header in the
+ * ltdb before trying to migrate the record from
+ * the current lmaster. Keep it instead of trying
+ * to delete the non-existing record...
+ */
+ keep = true;
+ schedule_for_deletion = true;
+ } else if (header->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) {
+ keep = true;
+ } else if (ctdb_db->ctdb->pnn == lmaster) {
+ /*
+ * If we are lmaster, then we usually keep the record.
+ * But if we retrieve the dmaster role by a VACUUM_MIGRATE
+ * and the record is empty and has never been migrated
+ * with data, then we should delete it instead of storing it.
+ * This is part of the vacuuming process.
+ *
+ * The reason that we usually need to store even empty records
+ * on the lmaster is that a client operating directly on the
+ * lmaster (== dmaster) expects the local copy of the record to
+ * exist after successful ctdb migrate call. If the record does
+ * not exist, the client goes into a migrate loop and eventually
+ * fails. So storing the empty record makes sure that we do not
+ * need to change the client code.
+ */
+ if (!(header->flags & CTDB_REC_FLAG_VACUUM_MIGRATED)) {
+ keep = true;
+ } else if (ctdb_db->ctdb->pnn != header->dmaster) {
+ keep = true;
+ }
+ } else if (ctdb_db->ctdb->pnn == header->dmaster) {
+ keep = true;
+ }
+
+ if (keep) {
+ if (ctdb_db_volatile(ctdb_db) &&
+ (ctdb_db->ctdb->pnn == header->dmaster) &&
+ !(header->flags & CTDB_REC_RO_FLAGS))
+ {
+ header->rsn++;
+
+ if (data.dsize == 0) {
+ schedule_for_deletion = true;
+ }
+ }
+ remove_from_delete_queue = !schedule_for_deletion;
+ }
+
+store:
+ /*
+ * The VACUUM_MIGRATED flag is only set temporarily for
+ * the above logic when the record was retrieved by a
+ * VACUUM_MIGRATE call and should not be stored in the
+ * database.
+ *
+ * The VACUUM_MIGRATE call is triggered by a vacuum fetch,
+ * and there are two cases in which the corresponding record
+ * is stored in the local database:
+ * 1. The record has been migrated with data in the past
+ * (the MIGRATED_WITH_DATA record flag is set).
+ * 2. The record has been filled with data again since it
+ * had been submitted in the VACUUM_FETCH message to the
+ * lmaster.
+ * For such records it is important to not store the
+ * VACUUM_MIGRATED flag in the database.
+ */
+ header->flags &= ~CTDB_REC_FLAG_VACUUM_MIGRATED;
+
+ /*
+ * Similarly, clear the AUTOMATIC flag which should not enter
+ * the local database copy since this would require client
+ * modifications to clear the flag when the client stores
+ * the record.
+ */
+ header->flags &= ~CTDB_REC_FLAG_AUTOMATIC;
+
+ rec[0].dsize = hsize;
+ rec[0].dptr = (uint8_t *)header;
+
+ rec[1].dsize = data.dsize;
+ rec[1].dptr = data.dptr;
+
+ DEBUG(DEBUG_DEBUG, (__location__ " db[%s]: %s record: hash[0x%08x]\n",
+ ctdb_db->db_name,
+ keep?"storing":"deleting",
+ ctdb_hash(&key)));
+
+ if (keep) {
+ ret = tdb_storev(ctdb_db->ltdb->tdb, key, rec, 2, TDB_REPLACE);
+ } else {
+ ret = tdb_delete(ctdb_db->ltdb->tdb, key);
+ }
+
+ if (ret != 0) {
+ int lvl = DEBUG_ERR;
+
+ if (keep == false &&
+ tdb_error(ctdb_db->ltdb->tdb) == TDB_ERR_NOEXIST)
+ {
+ lvl = DEBUG_DEBUG; /* deleting a record that is already gone is only logged at debug level */
+ }
+
+ DEBUG(lvl, (__location__ " db[%s]: Failed to %s record: "
+ "%d - %s\n",
+ ctdb_db->db_name,
+ keep?"store":"delete", ret,
+ tdb_errorstr(ctdb_db->ltdb->tdb)));
+
+ schedule_for_deletion = false; /* the write failed: leave the delete queue untouched */
+ remove_from_delete_queue = false;
+ }
+
+ if (schedule_for_deletion) {
+ int ret2;
+ ret2 = ctdb_local_schedule_for_deletion(ctdb_db, header, key);
+ if (ret2 != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb_local_schedule_for_deletion failed.\n"));
+ }
+ }
+
+ if (remove_from_delete_queue) {
+ ctdb_local_remove_from_delete_queue(ctdb_db, header, key);
+ }
+
+ return ret;
+}
+
+struct lock_fetch_state {
+ struct ctdb_context *ctdb; /* copied from ctdb_db->ctdb when the request is queued */
+ struct ctdb_db_context *ctdb_db; /* database the chainlock was requested on */
+ void (*recv_pkt)(void *, struct ctdb_req_header *); /* called as recv_pkt(recv_context, hdr) to re-submit the packet */
+ void *recv_context; /* first argument handed to recv_pkt */
+ struct ctdb_req_header *hdr; /* the deferred request packet */
+ uint32_t generation; /* ctdb_db->generation at the time the request was queued */
+ bool ignore_generation; /* when true the packet is re-submitted even if the generation changed */
+};
+
+/*
+  called when we should retry the operation
+
+  Unless the request asked to ignore generations, a packet queued under
+  a database generation that differs from the current one is freed
+  rather than re-submitted.
+ */
+static void lock_fetch_callback(void *p, bool locked)
+{
+	struct lock_fetch_state *fetch = talloc_get_type(p, struct lock_fetch_state);
+	bool stale;
+
+	stale = !fetch->ignore_generation &&
+		(fetch->generation != fetch->ctdb_db->generation);
+
+	if (!stale) {
+		fetch->recv_pkt(fetch->recv_context, fetch->hdr);
+		DEBUG(DEBUG_INFO,(__location__ " PACKET REQUEUED\n"));
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Discarding previous generation lockwait packet\n"));
+	talloc_free(fetch->hdr);
+}
+
+
+/*
+  do a non-blocking ltdb_lock, deferring this ctdb request until we
+  have the chainlock
+
+  It does the following:
+
+  1) tries to get the chainlock. If it succeeds, then it returns 0
+
+  2) if it fails to get a chainlock immediately then it sets up a
+  non-blocking chainlock via ctdb_lock_record, and when it gets the
+  chainlock it re-submits this ctdb request to the main packet
+  receive function.
+
+  This effectively queues all ctdb requests that cannot be
+  immediately satisfied until it can get the lock. This means that
+  the main ctdb daemon will not block waiting for a chainlock held by
+  a client
+
+  There are 3 possible return values:
+
+  0: means that it got the lock immediately.
+  -1: means that it failed to get the lock, and won't retry
+  -2: means that it failed to get the lock immediately, but will retry
+
+  When -2 is returned the packet hdr has been talloc-stolen onto the
+  retry state, so the caller must not free it.
+ */
+int ctdb_ltdb_lock_requeue(struct ctdb_db_context *ctdb_db,
+			   TDB_DATA key, struct ctdb_req_header *hdr,
+			   void (*recv_pkt)(void *, struct ctdb_req_header *),
+			   void *recv_context, bool ignore_generation)
+{
+	int ret;
+	struct tdb_context *tdb = ctdb_db->ltdb->tdb;
+	struct lock_request *lreq;
+	struct lock_fetch_state *state;
+
+	ret = tdb_chainlock_nonblock(tdb, key);
+
+	if (ret != 0 &&
+	    !(errno == EACCES || errno == EAGAIN || errno == EDEADLK)) {
+		/* a hard failure - don't try again */
+		return -1;
+	}
+
+	/*
+	 * when torturing, ensure we test the contended path.  Only
+	 * give the chainlock back if we really obtained it above.
+	 */
+	if (ret == 0 &&
+	    (ctdb_db->ctdb->flags & CTDB_FLAG_TORTURE) &&
+	    random() % 5 == 0) {
+		ret = -1;
+		tdb_chainunlock(tdb, key);
+	}
+
+	/* first the non-contended path */
+	if (ret == 0) {
+		return 0;
+	}
+
+	state = talloc(hdr, struct lock_fetch_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		return -1;
+	}
+	state->ctdb = ctdb_db->ctdb;
+	state->ctdb_db = ctdb_db;
+	state->hdr = hdr;
+	state->recv_pkt = recv_pkt;
+	state->recv_context = recv_context;
+	state->generation = ctdb_db->generation;
+	state->ignore_generation = ignore_generation;
+
+	/* now the contended path */
+	lreq = ctdb_lock_record(state, ctdb_db, key, true, lock_fetch_callback, state);
+	if (lreq == NULL) {
+		talloc_free(state);
+		return -1;
+	}
+
+	/* we need to move the packet off the temporary context in ctdb_input_pkt(),
+	   so it won't be freed yet */
+	talloc_steal(state, hdr);
+
+	/* now tell the caller that we will retry asynchronously */
+	return -2;
+}
+
+/*
+  a variant of ctdb_ltdb_lock_requeue that also fetches the record
+
+  If the chainlock is not obtained immediately the (non-zero) result of
+  ctdb_ltdb_lock_requeue() is returned.  Otherwise the result of
+  ctdb_ltdb_fetch() is returned; when the fetch fails the chainlock is
+  released again first.
+ */
+int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db,
+				 TDB_DATA key, struct ctdb_ltdb_header *header,
+				 struct ctdb_req_header *hdr, TDB_DATA *data,
+				 void (*recv_pkt)(void *, struct ctdb_req_header *),
+				 void *recv_context, bool ignore_generation)
+{
+	int lock_ret;
+	int fetch_ret;
+	int unlock_ret;
+
+	lock_ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr, recv_pkt,
+					  recv_context, ignore_generation);
+	if (lock_ret != 0) {
+		return lock_ret;
+	}
+
+	fetch_ret = ctdb_ltdb_fetch(ctdb_db, key, header, hdr, data);
+	if (fetch_ret == 0) {
+		return 0;
+	}
+
+	/* the fetch failed: do not leave the record locked */
+	unlock_ret = ctdb_ltdb_unlock(ctdb_db, key);
+	if (unlock_ret != 0) {
+		DBG_ERR("ctdb_ltdb_unlock() failed with error %d\n",
+			unlock_ret);
+	}
+	return fetch_ret;
+}
+
+
+/*
+  paranoid check to see if the db is empty
+
+  Calls ctdb_fatal() if tdb_traverse_read() returns anything but 0.
+ */
+static void ctdb_check_db_empty(struct ctdb_db_context *ctdb_db)
+{
+	int nrecords;
+
+	nrecords = tdb_traverse_read(ctdb_db->ltdb->tdb, NULL, NULL);
+	if (nrecords == 0) {
+		return;
+	}
+
+	DEBUG(DEBUG_ALERT,(__location__ " tdb '%s' not empty on attach! aborting\n",
+			   ctdb_db->db_path));
+	ctdb_fatal(ctdb_db->ctdb, "database not empty on attach");
+}
+
+/*
+  reload ctdb_db->unhealthy_reason from the persistent health database
+
+  The record is looked up by database name.  A non-empty record becomes
+  the new unhealthy_reason (a talloc copy owned by ctdb_db); a missing
+  or empty record sets it to NULL.  The previous reason is freed on
+  success and kept in place on failure.
+
+  Returns 0 on success, -1 if the copy could not be allocated.
+ */
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+				struct ctdb_db_context *ctdb_db)
+{
+	struct tdb_context *health_tdb = ctdb->db_persistent_health->tdb;
+	char *previous = ctdb_db->unhealthy_reason;
+	char *loaded = NULL;
+	TDB_DATA key;
+	TDB_DATA val;
+
+	key.dsize = strlen(ctdb_db->db_name);
+	key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+
+	ctdb_db->unhealthy_reason = NULL;
+
+	val = tdb_fetch(health_tdb, key);
+	if (val.dsize > 0) {
+		loaded = talloc_strndup(ctdb_db,
+					(const char *)val.dptr,
+					val.dsize);
+		if (loaded == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_strndup(%d) failed\n",
+					   (int)val.dsize));
+			ctdb_db->unhealthy_reason = previous;
+			free(val.dptr);
+			return -1;
+		}
+	}
+
+	if (val.dptr != NULL) {
+		free(val.dptr);
+	}
+
+	talloc_free(previous);
+	ctdb_db->unhealthy_reason = loaded;
+	return 0;
+}
+
+/*
+  update the health record of a database in the persistent health tdb
+
+  given_reason      - new unhealthy reason, NULL means healthy
+  num_healthy_nodes - when given_reason is NULL and this is 0, an
+                      existing reason is kept and prefixed with
+                      "NO-HEALTHY-NODES - " (unless already prefixed)
+
+  The record is stored (or deleted when the database is healthy) inside
+  a tdb transaction and ctdb_db->unhealthy_reason is updated to match.
+  Every error path after the transaction was started cancels it, so no
+  transaction is left open on failure.
+
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+				  struct ctdb_db_context *ctdb_db,
+				  const char *given_reason,/* NULL means healthy */
+				  unsigned int num_healthy_nodes)
+{
+	struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+	int ret;
+	TDB_DATA key;
+	TDB_DATA val;
+	char *new_reason = NULL;
+	char *old_reason = NULL;
+
+	ret = tdb_transaction_start(tdb);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_start('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		return -1;
+	}
+
+	ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+	if (ret != 0) {
+		tdb_transaction_cancel(tdb);
+		DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+		return -1;
+	}
+	old_reason = ctdb_db->unhealthy_reason;
+
+	key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+	key.dsize = strlen(ctdb_db->db_name);
+
+	if (given_reason) {
+		new_reason = talloc_strdup(ctdb_db, given_reason);
+		if (new_reason == NULL) {
+			tdb_transaction_cancel(tdb);
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup(%s) failed\n",
+					   given_reason));
+			return -1;
+		}
+	} else if (old_reason && num_healthy_nodes == 0) {
+		/*
+		 * If the reason indicates ok, but there were no healthy
+		 * nodes available, it means that we have not recovered
+		 * valid content of the db. So if there's an old reason,
+		 * prefix it with "NO-HEALTHY-NODES - "
+		 */
+		const char *prefix;
+
+#define _TMP_PREFIX "NO-HEALTHY-NODES - "
+		ret = strncmp(_TMP_PREFIX, old_reason, strlen(_TMP_PREFIX));
+		if (ret != 0) {
+			prefix = _TMP_PREFIX;
+		} else {
+			prefix = "";
+		}
+		new_reason = talloc_asprintf(ctdb_db, "%s%s",
+					     prefix, old_reason);
+		if (new_reason == NULL) {
+			tdb_transaction_cancel(tdb);
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_asprintf(%s%s) failed\n",
+					   prefix, old_reason));
+			return -1;
+		}
+#undef _TMP_PREFIX
+	}
+
+	if (new_reason) {
+		val.dptr = discard_const_p(uint8_t, new_reason);
+		val.dsize = strlen(new_reason);
+
+		ret = tdb_store(tdb, key, val, TDB_REPLACE);
+		if (ret != 0) {
+			tdb_transaction_cancel(tdb);
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_store('%s', %s, %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name, new_reason,
+					   ret, tdb_errorstr(tdb)));
+			talloc_free(new_reason);
+			return -1;
+		}
+		DEBUG(DEBUG_ALERT,("Updated db health for db(%s) to: %s\n",
+				   ctdb_db->db_name, new_reason));
+	} else if (old_reason) {
+		ret = tdb_delete(tdb, key);
+		if (ret != 0) {
+			tdb_transaction_cancel(tdb);
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_delete('%s', %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name,
+					   ret, tdb_errorstr(tdb)));
+			talloc_free(new_reason);
+			return -1;
+		}
+		DEBUG(DEBUG_NOTICE,("Updated db health for db(%s): OK\n",
+				    ctdb_db->db_name));
+	}
+
+	ret = tdb_transaction_commit(tdb);
+	if (ret != TDB_SUCCESS) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_commit('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		talloc_free(new_reason);
+		return -1;
+	}
+
+	/* only now does the in-memory state follow the stored record */
+	talloc_free(old_reason);
+	ctdb_db->unhealthy_reason = new_reason;
+
+	return 0;
+}
+
+/*
+  move a corrupted tdb out of the way
+
+  Records "ERROR - Backup of corrupted TDB in '<new path>'" as the
+  database's persistent health reason and then renames db_path to
+  "<db_path>.corrupted.<YYYYMMDDhhmmss>.0Z" (UTC timestamp).
+
+  Returns 0 on success, -1 on failure.  The temporary path and reason
+  strings are freed on every path.
+ */
+static int ctdb_backup_corrupted_tdb(struct ctdb_context *ctdb,
+				     struct ctdb_db_context *ctdb_db)
+{
+	time_t now = time(NULL);
+	char *new_path;
+	char *new_reason;
+	int ret;
+	struct tm *tm;
+
+	tm = gmtime(&now);
+	if (tm == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " gmtime() failed\n"));
+		return -1;
+	}
+
+	/* formatted like: foo.tdb.0.corrupted.20091204160825.0Z */
+	new_path = talloc_asprintf(ctdb_db, "%s.corrupted."
+				   "%04d%02d%02d%02d%02d%02d.0Z",
+				   ctdb_db->db_path,
+				   tm->tm_year+1900, tm->tm_mon+1,
+				   tm->tm_mday, tm->tm_hour, tm->tm_min,
+				   tm->tm_sec);
+	if (new_path == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		return -1;
+	}
+
+	new_reason = talloc_asprintf(ctdb_db,
+				     "ERROR - Backup of corrupted TDB in '%s'",
+				     new_path);
+	if (new_reason == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		talloc_free(new_path);
+		return -1;
+	}
+	ret = ctdb_update_persistent_health(ctdb, ctdb_db, new_reason, 0);
+	talloc_free(new_reason);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,(__location__
+				 ": ctdb_backup_corrupted_tdb(%s) ctdb_update_persistent_health() failed\n",
+				 ctdb_db->db_path));
+		talloc_free(new_path);
+		return -1;
+	}
+
+	ret = rename(ctdb_db->db_path, new_path);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,(__location__
+				  ": ctdb_backup_corrupted_tdb(%s) rename to %s failed: %d - %s\n",
+				  ctdb_db->db_path, new_path,
+				  errno, strerror(errno)));
+		talloc_free(new_path);
+		return -1;
+	}
+
+	DEBUG(DEBUG_CRIT,(__location__
+			 ": ctdb_backup_corrupted_tdb(%s) renamed to %s\n",
+			 ctdb_db->db_path, new_path));
+	talloc_free(new_path);
+	return 0;
+}
+
+/*
+  reload and report the health of all attached persistent databases
+
+  Returns 0 if every persistent database is healthy, -1 if at least
+  one is unhealthy or if loading the health record of one failed (in
+  which case the scan stops at that database).
+ */
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb)
+{
+	struct ctdb_db_context *db;
+	int ok = 0;
+	int fail = 0;
+
+	for (db = ctdb->db_list; db != NULL; db = db->next) {
+		if (!ctdb_db_persistent(db)) {
+			continue;
+		}
+
+		if (ctdb_load_persistent_health(ctdb, db) != 0) {
+			DEBUG(DEBUG_ALERT,(__location__
+					   " load persistent health for '%s' failed\n",
+					   db->db_path));
+			return -1;
+		}
+
+		if (db->unhealthy_reason != NULL) {
+			fail++;
+			DEBUG(DEBUG_ALERT,(__location__
+					   " persistent db '%s' unhealthy: %s\n",
+					   db->db_path,
+					   db->unhealthy_reason));
+		} else {
+			ok++;
+			DEBUG(DEBUG_INFO,(__location__
+					  " persistent db '%s' healthy\n",
+					  db->db_path));
+		}
+	}
+	DEBUG(DEBUG_NOTICE,
+	      ("ctdb_recheck_persistent_health: OK[%d] FAIL[%d]\n",
+	       ok, fail));
+
+	return (fail != 0) ? -1 : 0;
+}
+
+
+/*
+  mark a database - as healthy
+
+  indata carries the uint32_t database id.  If the database was marked
+  unhealthy beforehand and the daemon is still in the STARTUP runstate,
+  the recovery mode is set to active.
+
+  Returns 0 on success, -1 if the database is unknown or the health
+  record could not be updated.
+ */
+int32_t ctdb_control_db_set_healthy(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	struct ctdb_db_context *db;
+	bool was_unhealthy;
+
+	db = find_ctdb_db(ctdb, db_id);
+	if (db == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+		return -1;
+	}
+
+	/* sample this before the update below replaces the reason */
+	was_unhealthy = (db->unhealthy_reason != NULL);
+
+	if (ctdb_update_persistent_health(ctdb, db, NULL, 1) != 0) {
+		DEBUG(DEBUG_ERR,(__location__
+				 " ctdb_update_persistent_health(%s) failed\n",
+				 db->db_name));
+		return -1;
+	}
+
+	if (was_unhealthy && ctdb->runstate == CTDB_RUNSTATE_STARTUP) {
+		DEBUG(DEBUG_ERR, (__location__ " db %s become healthy - force recovery for startup\n",
+				  db->db_name));
+		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+	}
+
+	return 0;
+}
+
+/*
+  return the unhealthy reason of a database
+
+  indata carries the uint32_t database id.  On success *outdata is
+  tdb_null for a healthy database, otherwise it points at
+  ctdb_db->unhealthy_reason (not a copy) with the terminating NUL
+  included in dsize.
+
+  Returns 0 on success, -1 if the database is unknown or its health
+  record could not be loaded.
+ */
+int32_t ctdb_control_db_get_health(struct ctdb_context *ctdb,
+				   TDB_DATA indata,
+				   TDB_DATA *outdata)
+{
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	struct ctdb_db_context *db;
+
+	db = find_ctdb_db(ctdb, db_id);
+	if (db == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+		return -1;
+	}
+
+	if (ctdb_load_persistent_health(ctdb, db) != 0) {
+		DEBUG(DEBUG_ERR,(__location__
+				 " ctdb_load_persistent_health(%s) failed\n",
+				 db->db_name));
+		return -1;
+	}
+
+	if (db->unhealthy_reason == NULL) {
+		*outdata = tdb_null;
+	} else {
+		outdata->dptr = (uint8_t *)db->unhealthy_reason;
+		outdata->dsize = strlen(db->unhealthy_reason)+1;
+	}
+
+	return 0;
+}
+
+
+/*
+  enable the readonly property on a volatile database
+
+  Opens (creating it if needed) the tracking database "<db_path>.RO"
+  and flags the database as readonly.  Does nothing if the flag is
+  already set.
+
+  Returns 0 on success, -1 if the database is not volatile or the
+  tracking database cannot be set up.
+ */
+int ctdb_set_db_readonly(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db)
+{
+	char *tracking_path;
+	int result = -1;
+
+	if (ctdb_db_readonly(ctdb_db)) {
+		return 0;
+	}
+
+	if (! ctdb_db_volatile(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("Non-volatile databases do not support readonly flag\n"));
+		return -1;
+	}
+
+	tracking_path = talloc_asprintf(ctdb_db, "%s.RO", ctdb_db->db_path);
+	if (tracking_path == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to asprintf the tracking database\n"));
+		return -1;
+	}
+
+	ctdb_db->rottdb = tdb_open(tracking_path,
+				   ctdb->tunable.database_hash_size,
+				   TDB_NOLOCK|TDB_CLEAR_IF_FIRST|TDB_NOSYNC,
+				   O_CREAT|O_RDWR, 0600);
+	if (ctdb_db->rottdb == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to open/create the tracking database '%s'\n", tracking_path));
+		goto done;
+	}
+
+	DEBUG(DEBUG_NOTICE,("OPENED tracking database : '%s'\n", tracking_path));
+
+	ctdb_db_set_readonly(ctdb_db);
+
+	DEBUG(DEBUG_NOTICE, ("Readonly property set on DB %s\n", ctdb_db->db_name));
+
+	result = 0;
+
+done:
+	/* the path string is only needed for opening and logging */
+	talloc_free(tracking_path);
+	return result;
+}
+
+/*
+  attach to a database, handling both persistent and non-persistent databases
+  return 0 on success, -1 on failure
+
+  db_name          - name of the database; its hash is used as db_id
+  db_flags         - database flags, stored in the new context
+  unhealthy_reason - if not NULL and the database is persistent, it is
+                     recorded as the database's health reason first
+
+  A persistent tdb that cannot be opened or fails tdb_check() is moved
+  aside with ctdb_backup_corrupted_tdb() and the open is retried, at
+  most once, and only if max_persistent_check_errors is > 0 and the
+  daemon is not in the RUNNING runstate.
+
+  The new context is linked into ctdb->db_list before the call
+  functions are registered; if a later step fails it is unlinked again
+  before being freed, so the list never holds a freed entry.
+ */
+static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
+			     uint8_t db_flags, const char *unhealthy_reason)
+{
+	struct ctdb_db_context *ctdb_db, *tmp_db;
+	int ret;
+	struct TDB_DATA key;
+	int tdb_flags;
+	int mode = 0600;
+	int remaining_tries = 0;
+
+	ctdb_db = talloc_zero(ctdb, struct ctdb_db_context);
+	CTDB_NO_MEMORY(ctdb, ctdb_db);
+
+	ctdb_db->ctdb = ctdb;
+	ctdb_db->db_name = talloc_strdup(ctdb_db, db_name);
+	CTDB_NO_MEMORY(ctdb, ctdb_db->db_name);
+
+	key.dsize = strlen(db_name)+1;
+	key.dptr = discard_const(db_name);
+	ctdb_db->db_id = ctdb_hash(&key);
+	ctdb_db->db_flags = db_flags;
+
+	if (ctdb_db_volatile(ctdb_db)) {
+		ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->delete_queue == NULL) {
+			CTDB_NO_MEMORY(ctdb, ctdb_db->delete_queue);
+		}
+
+		ctdb_db->fetch_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->fetch_queue == NULL) {
+			CTDB_NO_MEMORY(ctdb, ctdb_db->fetch_queue);
+		}
+
+		ctdb_db->ctdb_ltdb_store_fn = ctdb_ltdb_store_server;
+	}
+
+	/* check for hash collisions */
+	for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) {
+		if (tmp_db->db_id == ctdb_db->db_id) {
+			DEBUG(DEBUG_CRIT,("db_id 0x%x hash collision. name1='%s' name2='%s'\n",
+				 tmp_db->db_id, db_name, tmp_db->db_name));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+	}
+
+	if (ctdb_db_persistent(ctdb_db)) {
+		if (unhealthy_reason) {
+			ret = ctdb_update_persistent_health(ctdb, ctdb_db,
+							    unhealthy_reason, 0);
+			if (ret != 0) {
+				DEBUG(DEBUG_ALERT,(__location__ " ctdb_update_persistent_health('%s','%s') failed: %d\n",
+						   ctdb_db->db_name, unhealthy_reason, ret));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+		}
+
+		if (ctdb->max_persistent_check_errors > 0) {
+			remaining_tries = 1;
+		}
+		if (ctdb->runstate == CTDB_RUNSTATE_RUNNING) {
+			remaining_tries = 0;
+		}
+
+		ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+	}
+
+	if (ctdb_db->unhealthy_reason && remaining_tries == 0) {
+		DEBUG(DEBUG_ALERT,(__location__ "ERROR: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/* this is just a warning, but we want that in the log file! */
+		DEBUG(DEBUG_ALERT,(__location__ "Warning: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
+	/* open the database */
+	ctdb_db->db_path = talloc_asprintf(ctdb_db, "%s/%s.%u",
+					   ctdb_db_persistent(ctdb_db) ?
+						ctdb->db_directory_persistent :
+						ctdb->db_directory,
+					   db_name, ctdb->pnn);
+	if (ctdb_db->db_path == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	tdb_flags = ctdb_db_tdb_flags(db_flags,
+				      ctdb->valgrinding,
+				      ctdb_config.tdb_mutexes);
+
+again:
+	ctdb_db->ltdb = tdb_wrap_open(ctdb_db, ctdb_db->db_path,
+				      ctdb->tunable.database_hash_size,
+				      tdb_flags,
+				      O_CREAT|O_RDWR, mode);
+	if (ctdb_db->ltdb == NULL) {
+		struct stat st;
+		int saved_errno = errno;
+
+		if (! ctdb_db_persistent(ctdb_db)) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		if (remaining_tries == 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		ret = stat(ctdb_db->db_path, &st);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		/* recreate the file with the mode of the one moved aside */
+		remaining_tries--;
+		mode = st.st_mode;
+		goto again;
+	}
+
+	if (!ctdb_db_persistent(ctdb_db)) {
+		ctdb_check_db_empty(ctdb_db);
+	} else {
+		ret = tdb_check(ctdb_db->ltdb->tdb, NULL, NULL);
+		if (ret != 0) {
+			int fd;
+			struct stat st;
+
+			DEBUG(DEBUG_CRIT,("tdb_check(%s) failed: %d - %s\n",
+					  ctdb_db->db_path, ret,
+					  tdb_errorstr(ctdb_db->ltdb->tdb)));
+			if (remaining_tries == 0) {
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			fd = tdb_fd(ctdb_db->ltdb->tdb);
+			ret = fstat(fd, &st);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,(__location__
+						  "Failed to fstat() persistent tdb '%s': %d - %s\n",
+						  ctdb_db->db_path,
+						  errno,
+						  strerror(errno)));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			/* close the TDB */
+			talloc_free(ctdb_db->ltdb);
+			ctdb_db->ltdb = NULL;
+
+			ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,("Failed to backup corrupted tdb '%s'\n",
+						  ctdb_db->db_path));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			remaining_tries--;
+			mode = st.st_mode;
+			goto again;
+		}
+	}
+
+	/* remember the flags the client has specified */
+	tdb_add_flags(ctdb_db->ltdb->tdb, tdb_flags);
+
+
+	/* set up a rb tree we can use to track which records we have a
+	   fetch-lock in-flight for so we can defer any additional calls
+	   for the same record.
+	 */
+	ctdb_db->deferred_fetch = trbt_create(ctdb_db, 0);
+	if (ctdb_db->deferred_fetch == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to create deferred fetch rb tree for ctdb database\n"));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ctdb_db->defer_dmaster = trbt_create(ctdb_db, 0);
+	if (ctdb_db->defer_dmaster == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to create defer dmaster rb tree for %s\n",
+				  ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	/*
+	 * From here on the context is reachable via ctdb->db_list, so
+	 * every failure path below must unlink it before freeing it.
+	 */
+	DLIST_ADD(ctdb->db_list, ctdb_db);
+
+	/* setting this can help some high churn databases */
+	tdb_set_max_dead(ctdb_db->ltdb->tdb, ctdb->tunable.database_max_dead);
+
+	/*
+	   all databases support the "null" function. we need this in
+	   order to do forced migration of records
+	*/
+	ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_null_func, CTDB_NULL_FUNC);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup null function for '%s'\n", ctdb_db->db_name));
+		DLIST_REMOVE(ctdb->db_list, ctdb_db);
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	/*
+	   all databases support the "fetch" function. we need this
+	   for efficient Samba3 ctdb fetch
+	*/
+	ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_func, CTDB_FETCH_FUNC);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+		DLIST_REMOVE(ctdb->db_list, ctdb_db);
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	/*
+	   all databases support the "fetch_with_header" function. we need this
+	   for efficient readonly record fetches
+	*/
+	ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+		DLIST_REMOVE(ctdb->db_list, ctdb_db);
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ret = ctdb_vacuum_init(ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup vacuuming for "
+				  "database '%s'\n", ctdb_db->db_name));
+		DLIST_REMOVE(ctdb->db_list, ctdb_db);
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ret = ctdb_migration_init(ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to setup migration tracking for db '%s'\n",
+		       ctdb_db->db_name));
+		DLIST_REMOVE(ctdb->db_list, ctdb_db);
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ret = db_hash_init(ctdb_db, "lock_log", 2048, DB_HASH_COMPLEX,
+			   &ctdb_db->lock_log);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to setup lock logging for db '%s'\n",
+		       ctdb_db->db_name));
+		DLIST_REMOVE(ctdb->db_list, ctdb_db);
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	ctdb_db->generation = ctdb->vnn_map->generation;
+
+	DEBUG(DEBUG_NOTICE,("Attached to database '%s' with flags 0x%x\n",
+			    ctdb_db->db_path, tdb_flags));
+
+	/* success */
+	return 0;
+}
+
+
+/*
+ * A client DB attach request that has been parked for later replay.
+ * Instances are linked on ctdb->deferred_attach.
+ */
+struct ctdb_deferred_attach_context {
+	/* list linkage used by the DLIST_* macros */
+	struct ctdb_deferred_attach_context *next;
+	struct ctdb_deferred_attach_context *prev;
+	/* daemon context whose deferred_attach list holds this entry */
+	struct ctdb_context *ctdb;
+	/* the parked control request, talloc-stolen onto this context */
+	struct ctdb_req_control_old *c;
+};
+
+
+/*
+ * talloc destructor: unlink the deferral from the daemon's
+ * deferred_attach list.  Always returns 0.
+ */
+static int ctdb_deferred_attach_destructor(struct ctdb_deferred_attach_context *da_ctx)
+{
+	struct ctdb_context *owner = da_ctx->ctdb;
+
+	DLIST_REMOVE(owner->deferred_attach, da_ctx);
+	return 0;
+}
+
+/*
+ * Timer handler for a deferred attach that was never replayed in time:
+ * reply to the parked control with status -1 and free the deferral.
+ */
+static void ctdb_deferred_attach_timeout(struct tevent_context *ev,
+					 struct tevent_timer *te,
+					 struct timeval t, void *private_data)
+{
+	struct ctdb_deferred_attach_context *deferred;
+
+	deferred = talloc_get_type(private_data,
+				   struct ctdb_deferred_attach_context);
+
+	ctdb_request_control_reply(deferred->ctdb, deferred->c, NULL, -1, NULL);
+	talloc_free(deferred);
+}
+
+/*
+ * Timer handler that replays a deferred attach: the parked control is
+ * fed back into the daemon's packet input path and the deferral is
+ * then freed.
+ */
+static void ctdb_deferred_attach_callback(struct tevent_context *ev,
+					  struct tevent_timer *te,
+					  struct timeval t, void *private_data)
+{
+	struct ctdb_deferred_attach_context *deferred;
+	struct ctdb_req_header *pkt;
+
+	deferred = talloc_get_type(private_data,
+				   struct ctdb_deferred_attach_context);
+	pkt = (struct ctdb_req_header *)deferred->c;
+
+	/* This talloc-steals the packet ->c */
+	ctdb_input_pkt(deferred->ctdb, pkt);
+	talloc_free(deferred);
+}
+
+/*
+ * Drain ctdb->deferred_attach: every parked attach request is removed
+ * from the list and scheduled for replay from the main event loop via
+ * a timer set one second from now.  Always returns 0.
+ */
+int ctdb_process_deferred_attach(struct ctdb_context *ctdb)
+{
+	for (;;) {
+		struct ctdb_deferred_attach_context *head = ctdb->deferred_attach;
+
+		if (head == NULL) {
+			break;
+		}
+
+		DLIST_REMOVE(ctdb->deferred_attach, head);
+		tevent_add_timer(ctdb->ev, head,
+				 timeval_current_ofs(1,0),
+				 ctdb_deferred_attach_callback, head);
+	}
+
+	return 0;
+}
+
+/*
+ a client has asked to attach a new database
+
+ indata.dptr is used as the database name (a C string).  On success
+ outdata is pointed at the db_id field of the database context and 0
+ is returned; -1 is returned on any refusal or failure.  When a local
+ client's request has to wait for recovery or startup to finish, the
+ request is parked on ctdb->deferred_attach, *async_reply is set to
+ true and 0 is returned without a reply being produced here.
+ */
+int32_t ctdb_control_db_attach(struct ctdb_context *ctdb,
+ TDB_DATA indata,
+ TDB_DATA *outdata,
+ uint8_t db_flags,
+ uint32_t srcnode,
+ uint32_t client_id,
+ struct ctdb_req_control_old *c,
+ bool *async_reply)
+{
+ const char *db_name = (const char *)indata.dptr;
+ struct ctdb_db_context *db;
+ struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+ struct ctdb_client *client = NULL;
+ uint32_t opcode;
+ /* attaching is refused outright when this tunable is 0 */
+ if (ctdb->tunable.allow_client_db_attach == 0) {
+ DEBUG(DEBUG_ERR, ("DB Attach to database %s denied by tunable "
+ "AllowClientDBAccess == 0\n", db_name));
+ return -1;
+ }
+
+ /* don't allow any local clients to attach while we are in recovery mode
+ * except for the recovery daemon.
+ * allow all attach from the network since these are always from remote
+ * recovery daemons.
+ *
+ * client stays NULL (and the checks below are skipped) unless the
+ * request came from this node with a non-zero client_id that is
+ * still registered.
+ */
+ if (srcnode == ctdb->pnn && client_id != 0) {
+ client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ }
+ if (client != NULL) {
+ /* If the node is inactive it is not part of the cluster
+ and we should not allow clients to attach to any
+ databases
+ */
+ if (node->flags & NODE_FLAGS_INACTIVE) {
+ DEBUG(DEBUG_ERR,("DB Attach to database %s refused since node is inactive (flags=0x%x)\n", db_name, node->flags));
+ return -1;
+ }
+ /* an attach flagged as coming from recovery is only valid while recovery is active */
+ if ((c->flags & CTDB_CTRL_FLAG_ATTACH_RECOVERY) &&
+ ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+ DBG_ERR("Attach from recovery refused because "
+ "recovery is not active\n");
+ return -1;
+ }
+ /* an ordinary attach during recovery, or before the STARTUP runstate, is deferred */
+ if (!(c->flags & CTDB_CTRL_FLAG_ATTACH_RECOVERY) &&
+ (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ||
+ ctdb->runstate < CTDB_RUNSTATE_STARTUP)) {
+ struct ctdb_deferred_attach_context *da_ctx = talloc(client, struct ctdb_deferred_attach_context);
+
+ if (da_ctx == NULL) {
+ DEBUG(DEBUG_ERR,("DB Attach to database %s deferral for client with pid:%d failed due to OOM.\n", db_name, client->pid));
+ return -1;
+ }
+ /* the deferral takes ownership of the request packet and is a talloc child of the client */
+ da_ctx->ctdb = ctdb;
+ da_ctx->c = talloc_steal(da_ctx, c);
+ talloc_set_destructor(da_ctx, ctdb_deferred_attach_destructor);
+ DLIST_ADD(ctdb->deferred_attach, da_ctx);
+ /* if the request is still parked when this timer fires, ctdb_deferred_attach_timeout() fails it with -1 */
+ tevent_add_timer(ctdb->ev, da_ctx,
+ timeval_current_ofs(ctdb->tunable.deferred_attach_timeout, 0),
+ ctdb_deferred_attach_timeout, da_ctx);
+
+ DEBUG(DEBUG_ERR,("DB Attach to database %s deferred for client with pid:%d since node is in recovery mode.\n", db_name, client->pid));
+ *async_reply = true;
+ return 0;
+ }
+ }
+
+ /* see if we already have this name; if so every requested flag
+ must already be set on the existing database */
+ db = ctdb_db_handle(ctdb, db_name);
+ if (db) {
+ if ((db->db_flags & db_flags) != db_flags) {
+ DEBUG(DEBUG_ERR,
+ ("Error: Failed to re-attach with 0x%x flags,"
+ " database has 0x%x flags\n", db_flags,
+ db->db_flags));
+ return -1;
+ }
+ outdata->dptr = (uint8_t *)&db->db_id;
+ outdata->dsize = sizeof(db->db_id);
+ return 0;
+ }
+ /* not attached yet: attach locally, then look the new handle up by name */
+ if (ctdb_local_attach(ctdb, db_name, db_flags, NULL) != 0) {
+ return -1;
+ }
+
+ db = ctdb_db_handle(ctdb, db_name);
+ if (!db) {
+ DEBUG(DEBUG_ERR,("Failed to find db handle for name '%s'\n", db_name));
+ return -1;
+ }
+ /* the reply references the db_id stored in the database context, it is not copied */
+ outdata->dptr = (uint8_t *)&db->db_id;
+ outdata->dsize = sizeof(db->db_id);
+
+ /* Try to ensure it's locked in mem */
+ lockdown_memory(ctdb->valgrinding);
+ /* the broadcast opcode depends on the kind of database that was attached */
+ if (ctdb_db_persistent(db)) {
+ opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
+ } else if (ctdb_db_replicated(db)) {
+ opcode = CTDB_CONTROL_DB_ATTACH_REPLICATED;
+ } else {
+ opcode = CTDB_CONTROL_DB_ATTACH;
+ }
+
+ /* tell all the other nodes about this database (no reply is requested) */
+ ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, opcode,
+ 0, CTDB_CTRL_FLAG_NOREPLY,
+ indata, NULL, NULL);
+
+ /* success */
+ return 0;
+}
+
+/*
+ * a client has asked to detach from a database
+ *
+ * indata.dptr is read as the 32-bit database id.  Returns 0 on success
+ * and -1 when the id is unknown, the detach is not permitted, or the
+ * requesting client can no longer be found.
+ *
+ * A request carrying a non-zero client_id is only forwarded to the
+ * connected nodes; the local teardown below runs when client_id is 0.
+ */
+int32_t ctdb_control_db_detach(struct ctdb_context *ctdb, TDB_DATA indata,
+ uint32_t client_id)
+{
+ uint32_t db_id;
+ struct ctdb_db_context *ctdb_db;
+ struct ctdb_client *client = NULL;
+
+ db_id = *(uint32_t *)indata.dptr;
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) {
+ DEBUG(DEBUG_ERR, ("Invalid dbid 0x%08x in DB detach\n",
+ db_id));
+ return -1;
+ }
+ /* detaching is refused while the allow_client_db_attach tunable is 1 */
+ if (ctdb->tunable.allow_client_db_attach == 1) {
+ DEBUG(DEBUG_ERR, ("DB detach from database %s denied. "
+ "Clients are allowed access to databases "
+ "(AllowClientDBAccess == 1)\n",
+ ctdb_db->db_name));
+ return -1;
+ }
+ /* only volatile databases may be detached */
+ if (! ctdb_db_volatile(ctdb_db)) {
+ DEBUG(DEBUG_ERR,
+ ("Detaching non-volatile database %s denied\n",
+ ctdb_db->db_name));
+ return -1;
+ }
+
+ /* Cannot detach from database when in recovery */
+ if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) {
+ DEBUG(DEBUG_ERR, ("DB detach denied while in recovery\n"));
+ return -1;
+ }
+
+ /* If a control comes from a client, then broadcast it to all nodes.
+ * Do the actual detach only if the control comes from other daemons.
+ */
+ if (client_id != 0) {
+ client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ if (client != NULL) {
+ /* forward the control to all the nodes */
+ ctdb_daemon_send_control(ctdb,
+ CTDB_BROADCAST_CONNECTED, 0,
+ CTDB_CONTROL_DB_DETACH, 0,
+ CTDB_CTRL_FLAG_NOREPLY,
+ indata, NULL, NULL);
+ return 0;
+ }
+ DEBUG(DEBUG_ERR, ("Client has gone away. Failing DB detach "
+ "for database '%s'\n", ctdb_db->db_name));
+ return -1;
+ }
+
+ /* Disable vacuuming and drop all vacuuming data */
+ talloc_free(ctdb_db->vacuum_handle);
+ talloc_free(ctdb_db->delete_queue);
+ talloc_free(ctdb_db->fetch_queue);
+
+ /* Terminate any deferred fetch */
+ talloc_free(ctdb_db->deferred_fetch);
+
+ /* Terminate any traverses.
+ * NOTE(review): the loop only ends if freeing an entry updates
+ * ctdb_db->traverse (presumably via a talloc destructor) - confirm.
+ */
+ while (ctdb_db->traverse) {
+ talloc_free(ctdb_db->traverse);
+ }
+
+ /* Terminate any revokes (same loop pattern as the traverses above) */
+ while (ctdb_db->revokechild_active) {
+ talloc_free(ctdb_db->revokechild_active);
+ }
+
+ /* Free readonly tracking database */
+ if (ctdb_db_readonly(ctdb_db)) {
+ talloc_free(ctdb_db->rottdb);
+ }
+ /* unlink from the daemon's database list before the context itself is freed */
+ DLIST_REMOVE(ctdb->db_list, ctdb_db);
+
+ DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
+ ctdb_db->db_name));
+ talloc_free(ctdb_db);
+
+ return 0;
+}
+
+/*
+  attach to all existing persistent databases
+
+  Scans ctdb->db_directory_persistent for entries whose name contains
+  ".tdb." followed only by decimal digits that equal this node's pnn,
+  and attaches to each of them under the name truncated after ".tdb",
+  with CTDB_DB_FLAGS_PERSISTENT.  unhealthy_reason is passed through to
+  ctdb_local_attach() unchanged.
+
+  Returns 0 on success (also when the directory cannot be opened) and
+  -1 when an attach or a memory allocation fails.
+ */
+static int ctdb_attach_persistent(struct ctdb_context *ctdb,
+				  const char *unhealthy_reason)
+{
+	DIR *d;
+	struct dirent *de;
+
+	/* open the persistent db directory and scan it for files */
+	d = opendir(ctdb->db_directory_persistent);
+	if (d == NULL) {
+		return 0;
+	}
+
+	while ((de = readdir(d)) != NULL) {
+		char *p, *s, *q;
+		size_t len = strlen(de->d_name);
+		unsigned int node;	/* matches the "%u" conversion below */
+		bool invalid_name = false;
+
+		s = talloc_strdup(ctdb, de->d_name);
+		if (s == NULL) {
+			closedir(d);
+			CTDB_NO_MEMORY(ctdb, s);
+		}
+
+		/* only accept names containing ".tdb." */
+		p = strstr(s, ".tdb.");
+		if (len < 7 || p == NULL) {
+			talloc_free(s);
+			continue;
+		}
+
+		/* only accept names where ".tdb." is followed by digits only */
+		for (q = p + 5; *q != '\0'; q++) {
+			/*
+			 * The <ctype.h> functions require a value
+			 * representable as unsigned char; a plain char
+			 * taken from a file name may be negative.
+			 */
+			if (!isdigit((unsigned char)*q)) {
+				invalid_name = true;
+				break;
+			}
+		}
+		if (invalid_name ||
+		    sscanf(p + 5, "%u", &node) != 1 ||
+		    node != ctdb->pnn) {
+			DEBUG(DEBUG_ERR,("Ignoring persistent database '%s'\n", de->d_name));
+			talloc_free(s);
+			continue;
+		}
+
+		/* cut the ".<pnn>" suffix off, leaving "<name>.tdb" */
+		p[4] = 0;
+
+		if (ctdb_local_attach(ctdb, s, CTDB_DB_FLAGS_PERSISTENT, unhealthy_reason) != 0) {
+			DEBUG(DEBUG_ERR,("Failed to attach to persistent database '%s'\n", de->d_name));
+			closedir(d);
+			talloc_free(s);
+			return -1;
+		}
+
+		DEBUG(DEBUG_INFO,("Attached to persistent database %s\n", s));
+
+		talloc_free(s);
+	}
+	closedir(d);
+	return 0;
+}
+
+/*
+ * Open the persistent health tdb for this node and attach to all
+ * persistent databases found on disk.
+ *
+ * If the health tdb cannot be opened, or fails tdb_check(), it is
+ * cleared once by opening it with TDB_CLEAR_IF_FIRST and the open is
+ * retried.  In that case a warning text is handed to
+ * ctdb_attach_persistent() as the unhealthy reason.  A second failure
+ * is fatal.
+ *
+ * Returns 0 on success, -1 on failure, or the non-zero result of
+ * ctdb_attach_persistent().
+ */
+int ctdb_attach_databases(struct ctdb_context *ctdb)
+{
+	int ret;
+	char *persistent_health_path = NULL;
+	char *unhealthy_reason = NULL;
+	bool first_try = true;
+
+	persistent_health_path = talloc_asprintf(ctdb, "%s/%s.%u",
+						 ctdb->db_directory_state,
+						 PERSISTENT_HEALTH_TDB,
+						 ctdb->pnn);
+	if (persistent_health_path == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		return -1;
+	}
+
+again:
+
+	ctdb->db_persistent_health = tdb_wrap_open(ctdb, persistent_health_path,
+						   0, TDB_DISALLOW_NESTING,
+						   O_CREAT | O_RDWR, 0600);
+	if (ctdb->db_persistent_health == NULL) {
+		struct tdb_wrap *tdb;
+
+		if (!first_try) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+					  persistent_health_path,
+					  errno,
+					  strerror(errno)));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+		first_try = false;
+
+		unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+						   persistent_health_path,
+						   "was cleared after a failure",
+						   "manual verification needed");
+		if (unhealthy_reason == NULL) {
+			DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+			talloc_free(persistent_health_path);
+			return -1;
+		}
+
+		DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - retrying after CLEAR_IF_FIRST\n",
+				  persistent_health_path));
+		tdb = tdb_wrap_open(ctdb, persistent_health_path,
+				    0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+				    O_CREAT | O_RDWR, 0600);
+		/*
+		 * A NULL handle is the failure case.  The check used to
+		 * be inverted, which reported a successful clear as an
+		 * error (and leaked the handle).
+		 */
+		if (tdb == NULL) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+					  persistent_health_path,
+					  errno,
+					  strerror(errno)));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+
+		/* the clearing open has done its job; close it and retry */
+		talloc_free(tdb);
+		goto again;
+	}
+	ret = tdb_check(ctdb->db_persistent_health->tdb, NULL, NULL);
+	if (ret != 0) {
+		struct tdb_wrap *tdb;
+
+		talloc_free(ctdb->db_persistent_health);
+		ctdb->db_persistent_health = NULL;
+
+		if (!first_try) {
+			DEBUG(DEBUG_CRIT,("tdb_check('%s') failed\n",
+					  persistent_health_path));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+		first_try = false;
+
+		unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+						   persistent_health_path,
+						   "was cleared after a failure",
+						   "manual verification needed");
+		if (unhealthy_reason == NULL) {
+			DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+			talloc_free(persistent_health_path);
+			return -1;
+		}
+
+		DEBUG(DEBUG_CRIT,("tdb_check('%s') failed - retrying after CLEAR_IF_FIRST\n",
+				  persistent_health_path));
+		tdb = tdb_wrap_open(ctdb, persistent_health_path,
+				    0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+				    O_CREAT | O_RDWR, 0600);
+		/* NULL means the clearing open failed (check was inverted) */
+		if (tdb == NULL) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+					  persistent_health_path,
+					  errno,
+					  strerror(errno)));
+			talloc_free(persistent_health_path);
+			talloc_free(unhealthy_reason);
+			return -1;
+		}
+
+		talloc_free(tdb);
+		goto again;
+	}
+	talloc_free(persistent_health_path);
+
+	/* unhealthy_reason is NULL unless the health tdb had to be cleared */
+	ret = ctdb_attach_persistent(ctdb, unhealthy_reason);
+	talloc_free(unhealthy_reason);
+	if (ret != 0) {
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+  called when a broadcast seqnum update comes in
+
+  Updates that originate from this node are ignored (returns 0).  For
+  other nodes the local tdb sequence number is bumped and cached in the
+  database context.  Returns -1 for an unknown db_id or when the
+  database is marked unhealthy.
+ */
+int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint32_t srcnode)
+{
+	struct ctdb_db_context *db;
+
+	/* don't update ourselves! */
+	if (srcnode == ctdb->pnn) {
+		return 0;
+	}
+
+	db = find_ctdb_db(ctdb, db_id);
+	if (db == NULL) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_update_seqnum\n", db_id));
+		return -1;
+	}
+
+	if (db->unhealthy_reason != NULL) {
+		DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_ltdb_update_seqnum: %s\n",
+				 db->db_name, db->unhealthy_reason));
+		return -1;
+	}
+
+	tdb_increment_seqnum_nonblock(db->ltdb->tdb);
+	db->seqnum = tdb_get_seqnum(db->ltdb->tdb);
+
+	return 0;
+}
+
+/*
+  timer to check for seqnum changes in a ltdb and propagate them
+
+  When the tdb sequence number differs from the cached one, an
+  UPDATE_SEQNUM control carrying the db_id is broadcast to the active
+  nodes.  The timer then re-arms itself; tunable.seqnum_interval is
+  split into whole seconds (/1000) and microseconds ((%1000)*1000).
+ */
+static void ctdb_ltdb_seqnum_check(struct tevent_context *ev,
+				   struct tevent_timer *te,
+				   struct timeval t, void *p)
+{
+	struct ctdb_db_context *db = talloc_get_type(p, struct ctdb_db_context);
+	struct ctdb_context *ctdb = db->ctdb;
+	uint32_t current = tdb_get_seqnum(db->ltdb->tdb);
+	struct timeval next_check;
+
+	if (current != db->seqnum) {
+		/* something has changed - propagate it */
+		TDB_DATA payload;
+
+		payload.dptr = (uint8_t *)&db->db_id;
+		payload.dsize = sizeof(uint32_t);
+
+		ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ACTIVE, 0,
+					 CTDB_CONTROL_UPDATE_SEQNUM, 0,
+					 CTDB_CTRL_FLAG_NOREPLY,
+					 payload, NULL, NULL);
+	}
+	db->seqnum = current;
+
+	/* setup a new timer */
+	next_check = timeval_current_ofs(ctdb->tunable.seqnum_interval/1000,
+					 (ctdb->tunable.seqnum_interval%1000)*1000);
+	db->seqnum_update = tevent_add_timer(ctdb->ev, db, next_check,
+					     ctdb_ltdb_seqnum_check, db);
+}
+
+/*
+  enable seqnum handling on this db
+
+  Starts the periodic seqnum check timer if it is not already set,
+  enables sequence numbers on the tdb and caches the current value.
+  Returns 0 on success, -1 if db_id is unknown.
+ */
+int32_t ctdb_ltdb_enable_seqnum(struct ctdb_context *ctdb, uint32_t db_id)
+{
+	struct ctdb_db_context *db = find_ctdb_db(ctdb, db_id);
+
+	if (db == NULL) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_enable_seqnum\n", db_id));
+		return -1;
+	}
+
+	if (db->seqnum_update == NULL) {
+		struct timeval first_check;
+
+		first_check = timeval_current_ofs(
+			ctdb->tunable.seqnum_interval/1000,
+			(ctdb->tunable.seqnum_interval%1000)*1000);
+		db->seqnum_update = tevent_add_timer(ctdb->ev, db, first_check,
+						     ctdb_ltdb_seqnum_check, db);
+	}
+
+	tdb_enable_seqnum(db->ltdb->tdb);
+	db->seqnum = tdb_get_seqnum(db->ltdb->tdb);
+
+	return 0;
+}
+
+/*
+ * Mark a database as sticky.
+ *
+ * A database that is already sticky is left alone (returns 0).  Only
+ * volatile databases are accepted; for any other database -1 is
+ * returned.  Otherwise a tree for sticky records is created on the
+ * database context and the sticky flag is set.
+ */
+int ctdb_set_db_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db)
+{
+	/* nothing to do when the flag is already set */
+	if (ctdb_db_sticky(ctdb_db)) {
+		return 0;
+	}
+
+	if (!ctdb_db_volatile(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("Non-volatile databases do not support sticky flag\n"));
+		return -1;
+	}
+
+	ctdb_db->sticky_records = trbt_create(ctdb_db, 0);
+	ctdb_db_set_sticky(ctdb_db);
+
+	DEBUG(DEBUG_NOTICE,("set db sticky %s\n", ctdb_db->db_name));
+	return 0;
+}
+
+/*
+ * Reset the per-database statistics: every hot key slot has its
+ * counters zeroed and its key buffer released, then the statistics
+ * structure itself is zeroed.
+ */
+void ctdb_db_statistics_reset(struct ctdb_db_context *ctdb_db)
+{
+	unsigned int slot = 0;
+
+	while (slot < MAX_HOT_KEYS) {
+		ctdb_db->hot_keys[slot].count = 0;
+		ctdb_db->hot_keys[slot].last_logged_count = 0;
+
+		/* only slots that hold a key have a buffer to release */
+		if (ctdb_db->hot_keys[slot].key.dsize > 0) {
+			TALLOC_FREE(ctdb_db->hot_keys[slot].key.dptr);
+			ctdb_db->hot_keys[slot].key.dsize = 0;
+		}
+
+		slot++;
+	}
+
+	ZERO_STRUCT(ctdb_db->statistics);
+}
+
+/*
+ * Build the statistics reply for one database.
+ *
+ * The current hot keys are first mirrored into ctdb_db->statistics.
+ * The reply is a single buffer allocated on outdata: the fixed part of
+ * struct ctdb_db_statistics_old up to hot_keys_wire, followed by the
+ * bytes of every hot key packed back to back.
+ *
+ * Returns 0 on success, -1 for an unknown db_id or allocation failure.
+ */
+int32_t ctdb_control_get_db_statistics(struct ctdb_context *ctdb,
+				uint32_t db_id,
+				TDB_DATA *outdata)
+{
+	const size_t fixed_len = offsetof(struct ctdb_db_statistics_old,
+					  hot_keys_wire);
+	struct ctdb_db_context *ctdb_db;
+	struct ctdb_db_statistics_old *current;
+	struct ctdb_db_statistics_old *reply;
+	size_t total_len = fixed_len;
+	unsigned int k;
+	char *cursor;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in get_db_statistics\n", db_id));
+		return -1;
+	}
+	current = &ctdb_db->statistics;
+
+	/* mirror the hot keys into the statistics and size the reply */
+	for (k = 0; k < MAX_HOT_KEYS; k++) {
+		current->hot_keys[k].key.dsize = ctdb_db->hot_keys[k].key.dsize;
+		current->hot_keys[k].key.dptr = ctdb_db->hot_keys[k].key.dptr;
+		current->hot_keys[k].count = ctdb_db->hot_keys[k].count;
+
+		total_len += current->hot_keys[k].key.dsize;
+	}
+
+	reply = talloc_size(outdata, total_len);
+	if (reply == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate db statistics structure\n"));
+		return -1;
+	}
+
+	memcpy(reply, current, fixed_len);
+	reply->num_hot_keys = MAX_HOT_KEYS;
+
+	/* append the key bytes directly after the fixed part */
+	cursor = &reply->hot_keys_wire[0];
+	for (k = 0; k < MAX_HOT_KEYS; k++) {
+		memcpy(cursor, current->hot_keys[k].key.dptr,
+		       current->hot_keys[k].key.dsize);
+		cursor += current->hot_keys[k].key.dsize;
+	}
+
+	outdata->dptr = (uint8_t *)reply;
+	outdata->dsize = total_len;
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c
new file mode 100644
index 0000000..ab58ec4
--- /dev/null
+++ b/ctdb/server/ctdb_monitor.c
@@ -0,0 +1,509 @@
+/*
+ monitoring links to all other nodes to detect dead nodes
+
+
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+struct ctdb_monitor_state { /* health monitoring state, stored in ctdb->monitor */
+ TALLOC_CTX *monitor_context; /* talloc parent of the monitoring timers and event script runs; freed by ctdb_stop_monitoring() */
+ uint32_t next_interval; /* seconds until the next health check; doubled after each check, capped at tunable.monitor_interval */
+ uint32_t event_script_timeouts; /* consecutive ETIMEDOUT results of the monitor event */
+};
+
+static void ctdb_check_health(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data);
+
+/*
+ * Run the notification script with the given event name as its only
+ * argument, using system().
+ *
+ * Returns -1 if the script cannot be stat()ed, is not executable by
+ * its owner, or system() itself fails; otherwise the exit status of
+ * the command (0 on success).
+ */
+static int ctdb_run_notification_script_child(struct ctdb_context *ctdb, const char *event)
+{
+	struct stat sb;
+	char *cmd;
+	int status;
+
+	if (stat(ctdb->notification_script, &sb) != 0) {
+		DEBUG(DEBUG_ERR,("Could not stat notification script %s. Can not send notifications.\n", ctdb->notification_script));
+		return -1;
+	}
+
+	if ((sb.st_mode & S_IXUSR) == 0) {
+		DEBUG(DEBUG_ERR,("Notification script %s is not executable.\n", ctdb->notification_script));
+		return -1;
+	}
+
+	cmd = talloc_asprintf(ctdb, "%s %s\n", ctdb->notification_script, event);
+	CTDB_NO_MEMORY(ctdb, cmd);
+
+	status = system(cmd);
+	if (status != -1) {
+		/* system() worked: reduce the wait status to the exit code */
+		status = WEXITSTATUS(status);
+	}
+
+	if (status != 0) {
+		DEBUG(DEBUG_ERR,("Notification script \"%s\" failed with error %d\n", cmd, status));
+	}
+
+	return status;
+}
+
+/*
+ * Run the notification script for an event in a forked child.
+ * Does nothing when no notification script is configured.  The parent
+ * returns right after the fork; the child always exits with status 0,
+ * a script failure is only logged.
+ */
+void ctdb_run_notification_script(struct ctdb_context *ctdb, const char *event)
+{
+	pid_t pid;
+
+	if (ctdb->notification_script == NULL) {
+		return;
+	}
+
+	pid = ctdb_fork(ctdb);
+	if (pid == (pid_t)-1) {
+		DEBUG(DEBUG_ERR,("Failed to fork() a notification child process\n"));
+		return;
+	}
+
+	if (pid != 0) {
+		/* parent: nothing more to do */
+		return;
+	}
+
+	/* child */
+	prctl_set_comment("ctdb_notification");
+	if (ctdb_run_notification_script_child(ctdb, event) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Notification script failed\n"));
+	}
+	_exit(0);
+}
+
+/*
+ called when a health monitoring event script finishes
+
+ status is 0 when the monitor event succeeded.  ECANCELED leaves the
+ node flags untouched.  ETIMEDOUT is counted and treated as OK until
+ tunable.monitor_timeout_count is reached.  Any other non-zero status
+ marks the node unhealthy.  The next health check is re-armed on every
+ path.  When the node flags changed, the change is sent to this node
+ with CTDB_SRVID_PUSH_NODE_FLAGS and a takeover run is requested from
+ all connected nodes.
+ */
+static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+ struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+ TDB_DATA data;
+ struct ctdb_node_flag_change c;
+ uint32_t next_interval;
+ int ret;
+ TDB_DATA rddata;
+ struct ctdb_srvid_message rd;
+ const char *state_str = NULL;
+ /* remember the flags as they were before this result is applied */
+ c.pnn = ctdb->pnn;
+ c.old_flags = node->flags;
+ /* payload of the takeover run request sent at the end */
+ ZERO_STRUCT(rd);
+ rd.pnn = ctdb->pnn;
+ rd.srvid = 0;
+
+ rddata.dptr = (uint8_t *)&rd;
+ rddata.dsize = sizeof(rd);
+
+ if (status == ECANCELED) {
+ DEBUG(DEBUG_ERR,("Monitoring event was cancelled\n"));
+ goto after_change_status;
+ }
+
+ if (status == ETIMEDOUT) {
+ ctdb->monitor->event_script_timeouts++;
+
+ if (ctdb->monitor->event_script_timeouts >=
+ ctdb->tunable.monitor_timeout_count) {
+ DEBUG(DEBUG_ERR,
+ ("Maximum monitor timeout count %u reached."
+ " Making node unhealthy\n",
+ ctdb->tunable.monitor_timeout_count));
+ } else {
+ /* We pretend this is OK. */
+ goto after_change_status;
+ }
+ } else {
+ ctdb->monitor->event_script_timeouts = 0;
+ }
+ /* only act on transitions: healthy -> unhealthy or unhealthy -> healthy */
+ if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
+ DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
+ node->flags |= NODE_FLAGS_UNHEALTHY;
+ ctdb->monitor->next_interval = 5;
+
+ ctdb_run_notification_script(ctdb, "unhealthy");
+ } else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
+ DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n"));
+ node->flags &= ~NODE_FLAGS_UNHEALTHY;
+ ctdb->monitor->next_interval = 5;
+
+ ctdb_run_notification_script(ctdb, "healthy");
+ }
+
+after_change_status:
+ next_interval = ctdb->monitor->next_interval;
+ /* back off: double the stored interval, capped at tunable.monitor_interval */
+ ctdb->monitor->next_interval *= 2;
+ if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) {
+ ctdb->monitor->next_interval = ctdb->tunable.monitor_interval;
+ }
+ /* the timer uses the interval read before it was doubled */
+ tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+ timeval_current_ofs(next_interval, 0),
+ ctdb_check_health, ctdb);
+ /* nothing to announce when the flags did not change */
+ if (c.old_flags == node->flags) {
+ return;
+ }
+
+ c.new_flags = node->flags;
+
+ data.dptr = (uint8_t *)&c;
+ data.dsize = sizeof(c);
+
+ /* ask the recovery daemon to push these changes out to all nodes */
+ ctdb_daemon_send_message(ctdb, ctdb->pnn,
+ CTDB_SRVID_PUSH_NODE_FLAGS, data);
+
+ if (c.new_flags & NODE_FLAGS_UNHEALTHY) {
+ state_str = "UNHEALTHY";
+ } else {
+ state_str = "HEALTHY";
+ }
+
+ /* ask the recmaster to reallocate all addresses */
+ DEBUG(DEBUG_ERR,
+ ("Node became %s. Ask recovery master to reallocate IPs\n",
+ state_str));
+ ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_TAKEOVER_RUN, rddata);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__
+ " Failed to send IP takeover run request\n"));
+ }
+}
+
+
+static void ctdb_run_startup(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data);
+/*
+  called when the startup event script finishes
+
+  On failure the startup event is retried after 5 seconds.  On success
+  the daemon moves to the RUNNING runstate, the "startup" notification
+  is run, the other connected nodes are told via CTDB_CONTROL_STARTUP
+  and the first health check is scheduled 2 seconds later.
+ */
+static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+	if (status == 0) {
+		DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
+		ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
+		ctdb->monitor->next_interval = 2;
+		ctdb_run_notification_script(ctdb, "startup");
+
+		/* tell all other nodes we've just started up */
+		ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED,
+					 0, CTDB_CONTROL_STARTUP, 0,
+					 CTDB_CTRL_FLAG_NOREPLY,
+					 tdb_null, NULL, NULL);
+
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(ctdb->monitor->next_interval, 0),
+				 ctdb_check_health, ctdb);
+		return;
+	}
+
+	/* the startup event failed: try again later */
+	DEBUG(DEBUG_ERR,("startup event failed\n"));
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 timeval_current_ofs(5, 0),
+			 ctdb_run_startup, ctdb);
+}
+
+/*
+ * Timer handler that launches the "startup" event script.
+ *
+ * While the runstate is still below STARTUP the handler re-arms itself
+ * one second later.  If the event script cannot be launched, the
+ * attempt is repeated after 5 seconds.
+ */
+static void ctdb_run_startup(struct tevent_context *ev,
+			     struct tevent_timer *te,
+			     struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data,
+						    struct ctdb_context);
+	int rc;
+
+	/* This is necessary to avoid the "startup" event colliding
+	 * with the "ipreallocated" event from the takeover run
+	 * following the first recovery.  We might as well serialise
+	 * these things if we can.
+	 */
+	if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
+		DEBUG(DEBUG_NOTICE,
+		      ("Not yet in startup runstate. Wait one more second\n"));
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(1, 0),
+				 ctdb_run_startup, ctdb);
+		return;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n"));
+	rc = ctdb_event_script_callback(ctdb,
+					ctdb->monitor->monitor_context,
+					ctdb_startup_callback,
+					ctdb, CTDB_EVENT_STARTUP, "%s", "");
+	if (rc == 0) {
+		/* launched; ctdb_startup_callback() takes over */
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,("Unable to launch startup event script\n"));
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 timeval_current_ofs(5, 0),
+			 ctdb_run_startup, ctdb);
+}
+
+/*
+ wait until we have finished initial recoveries before we start the
+ monitoring events
+
+ This timer handler re-arms itself every second until all of the
+ following hold: the vnn map has a valid generation, recovery mode is
+ NORMAL, enough time has passed since the last recovery (unless
+ fast_start is set) and ctdb_recheck_persistent_health() succeeded
+ for the current generation.  It then schedules ctdb_run_startup().
+ Once the persistent health check has failed
+ max_persistent_check_errors times, the daemon is shut down.
+ */
+static void ctdb_wait_until_recovered(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+ int ret;
+ static int count = 0;
+
+ count++;
+ /* log on each of the first 59 calls, afterwards only on every 600th call */
+ if (count < 60 || count%600 == 0) {
+ DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
+ if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_STOPPED) {
+ DEBUG(DEBUG_NOTICE,("Node is STOPPED. Node will NOT recover.\n"));
+ }
+ }
+ /* no valid generation yet: forget any earlier health check and wait */
+ if (ctdb->vnn_map->generation == INVALID_GENERATION) {
+ ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+ tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+ timeval_current_ofs(1, 0),
+ ctdb_wait_until_recovered, ctdb);
+ return;
+ }
+ /* a recovery is in progress */
+ if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+ ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+ DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
+ tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+ timeval_current_ofs(1, 0),
+ ctdb_wait_until_recovered, ctdb);
+ return;
+ }
+
+ /* unless fast_start is set, wait rerecovery_timeout + 3 after the last recovery finished */
+ if (!fast_start && timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
+ ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+ DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));
+
+ tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+ timeval_current_ofs(1, 0),
+ ctdb_wait_until_recovered, ctdb);
+ return;
+ }
+ /* the health check was already attempted for this generation (and failed, since we are still here) */
+ if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) {
+ DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() "
+ "until the next recovery\n"));
+ tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+ timeval_current_ofs(1, 0),
+ ctdb_wait_until_recovered, ctdb);
+ return;
+ }
+ /* record the generation first so a failed check is not repeated until the generation changes */
+ ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation;
+ ret = ctdb_recheck_persistent_health(ctdb);
+ if (ret != 0) {
+ ctdb->db_persistent_check_errors++;
+ if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) {
+ DEBUG(DEBUG_ERR,
+ (__location__ "ctdb_recheck_persistent_health() "
+ "failed (%llu of %llu times) - retry later\n",
+ (unsigned long long)ctdb->db_persistent_check_errors,
+ (unsigned long long)ctdb->max_persistent_check_errors));
+ tevent_add_timer(ctdb->ev,
+ ctdb->monitor->monitor_context,
+ timeval_current_ofs(1, 0),
+ ctdb_wait_until_recovered, ctdb);
+ return;
+ }
+ DEBUG(DEBUG_ALERT,(__location__
+ "ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n",
+ (unsigned long long)ctdb->db_persistent_check_errors));
+ ctdb_shutdown_sequence(ctdb, 11);
+ /* In case above returns due to duplicate shutdown */
+ return;
+ }
+ ctdb->db_persistent_check_errors = 0;
+ /* everything is in place: run the startup event from the event loop right away */
+ tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+ timeval_current(), ctdb_run_startup, ctdb);
+}
+
+
+/*
+  see if the event scripts think we are healthy
+
+  Monitoring is skipped, and this timer re-armed after next_interval
+  seconds, while the node is in recovery, inactive, not in the RUNNING
+  runstate, or all databases are frozen.  Otherwise the "monitor" event
+  script is launched.  If launching it fails, the check is retried
+  after 5 seconds.
+ */
+static void ctdb_check_health(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	bool skip;
+	int rc;
+
+	skip = (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) ||
+	       (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) ||
+	       (ctdb->runstate != CTDB_RUNSTATE_RUNNING);
+
+	/* the frozen check is only consulted when nothing else blocks */
+	if (!skip && ctdb_db_all_frozen(ctdb)) {
+		DEBUG(DEBUG_ERR,
+		      ("Skip monitoring since databases are frozen\n"));
+		skip = true;
+	}
+
+	if (skip) {
+		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+				 timeval_current_ofs(ctdb->monitor->next_interval, 0),
+				 ctdb_check_health, ctdb);
+		return;
+	}
+
+	rc = ctdb_event_script_callback(ctdb,
+					ctdb->monitor->monitor_context,
+					ctdb_health_callback,
+					ctdb, CTDB_EVENT_MONITOR, "%s", "");
+	if (rc == 0) {
+		/* launched; ctdb_health_callback() re-arms the timer */
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
+	ctdb->monitor->next_interval = 5;
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 timeval_current_ofs(5, 0),
+			 ctdb_check_health, ctdb);
+}
+
+/*
+ * stop any monitoring
+ * this should only be done when shutting down the daemon
+ *
+ * Frees the monitor talloc context (and with it everything allocated
+ * on it) and resets the next interval to 5.  Does nothing but log
+ * when monitoring has not been initialised yet.
+ */
+void ctdb_stop_monitoring(struct ctdb_context *ctdb)
+{
+	struct ctdb_monitor_state *mon = ctdb->monitor;
+
+	if (mon == NULL) {
+		D_NOTICE("Monitoring not yet initialised\n");
+		return;
+	}
+
+	TALLOC_FREE(mon->monitor_context);
+	mon->next_interval = 5;
+
+	DEBUG(DEBUG_NOTICE,("Monitoring has been stopped\n"));
+}
+
+/*
+  start watching for nodes that might be dead
+
+  Moves the daemon to the FIRST_RECOVERY runstate, allocates the
+  monitoring state together with its talloc context and schedules
+  ctdb_wait_until_recovered() one second from now.
+ */
+void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb)
+{
+	struct timeval first_check;
+
+	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
+
+	ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
+
+	ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
+
+	first_check = timeval_current_ofs(1, 0);
+	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
+			 first_check,
+			 ctdb_wait_until_recovered, ctdb);
+}
+
+
+/*
+  modify flags on a node
+
+  indata carries a struct ctdb_node_flag_change.  Changes addressed to
+  this node itself, or to a node currently flagged as disconnected, are
+  ignored (returns 0).  Returns -1 when the pnn does not name a known
+  node and 0 otherwise.
+ */
+int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_node_flag_change *change;
+	struct ctdb_node *target;
+	uint32_t before;
+
+	change = (struct ctdb_node_flag_change *)indata.dptr;
+
+	/*
+	 * Don't let other nodes override the current node's flags.
+	 * The recovery master fetches flags from this node so there's
+	 * no need to push them back.  Doing so is racy.
+	 */
+	if (change->pnn == ctdb->pnn) {
+		DBG_DEBUG("Ignoring flag changes for current node\n");
+		return 0;
+	}
+
+	target = ctdb_find_node(ctdb, change->pnn);
+	if (target == NULL) {
+		DBG_ERR("Node %u is invalid\n", change->pnn);
+		return -1;
+	}
+
+	if (target->flags & NODE_FLAGS_DISCONNECTED) {
+		DBG_DEBUG("Ignoring flag changes for disconnected node\n");
+		return 0;
+	}
+
+	/*
+	 * Remember the old flags.  We don't care what some other node
+	 * thought the old flags were - that's irrelevant.
+	 */
+	before = target->flags;
+
+	/*
+	 * This node tracks nodes it is connected to, so don't let
+	 * another node override this: the DISCONNECTED bit is kept
+	 * from the local flags, every other bit comes from the request.
+	 */
+	target->flags = (before & NODE_FLAGS_DISCONNECTED) |
+			(change->new_flags & ~NODE_FLAGS_DISCONNECTED);
+
+	if (target->flags == before) {
+		return 0;
+	}
+
+	D_NOTICE("Node %u has changed flags - 0x%x -> 0x%x\n",
+		 change->pnn,
+		 before,
+		 target->flags);
+
+	if (target->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) {
+		DBG_ERR("Node %u became healthy - force recovery for startup\n",
+			change->pnn);
+		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+	}
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_mutex_fcntl_helper.c b/ctdb/server/ctdb_mutex_fcntl_helper.c
new file mode 100644
index 0000000..84d3790
--- /dev/null
+++ b/ctdb/server/ctdb_mutex_fcntl_helper.c
@@ -0,0 +1,794 @@
+/*
+ CTDB mutex fcntl lock file helper
+
+ Copyright (C) Martin Schwenke 2015
+
+ wait_for_parent() code from ctdb_lock_helper.c:
+
+ Copyright (C) Amitay Isaacs 2013
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+
+#include <tevent.h>
+
+#include "lib/util/sys_rw.h"
+#include "lib/util/tevent_unix.h"
+#include "lib/util/util.h"
+#include "lib/util/smb_strtox.h"
+
+/* protocol.h is just needed for ctdb_sock_addr, which is used in system.h */
+#include "protocol/protocol.h"
+#include "common/system.h"
+#include "common/tmon.h"
+
+/* Copy of argv[0], filled in by main() */
+static char progpath[PATH_MAX];
+/* basename() of progpath; used as the prefix of diagnostic messages */
+static char *progname = NULL;
+
+/*
+ * Request a write lock on the single byte at offset "start" of fd.
+ *
+ * With "block" set the request uses F_SETLKW, otherwise F_SETLK.
+ * Returns 0 on success or the errno left by fcntl() on failure.
+ */
+static int fcntl_lock_fd(int fd, bool block, off_t start)
+{
+	struct flock request = {
+		.l_type = F_WRLCK,
+		.l_whence = SEEK_SET,
+		.l_start = start,
+		.l_len = 1,
+		.l_pid = 0,
+	};
+	int rc;
+
+	if (block) {
+		rc = fcntl(fd, F_SETLKW, &request);
+	} else {
+		rc = fcntl(fd, F_SETLK, &request);
+	}
+
+	return (rc != 0) ? errno : 0;
+}
+
+/*
+ * Open (creating if necessary) the lock file and try to take a
+ * non-blocking lock on its first byte.
+ *
+ * Returns a status character: '0' on success (the locked descriptor
+ * is stored in *outfd), '1' on lock contention and '3' on any other
+ * failure.  *outfd is only written on success.
+ */
+static char fcntl_lock(const char *file, int *outfd)
+{
+	int lock_fd;
+	int err;
+
+	lock_fd = open(file, O_RDWR|O_CREAT, 0600);
+	if (lock_fd == -1) {
+		fprintf(stderr, "%s: Unable to open %s - (%s)\n",
+			progname, file, strerror(errno));
+		return '3';
+	}
+
+	err = fcntl_lock_fd(lock_fd, false, 0);
+	if (err == 0) {
+		*outfd = lock_fd;
+		return '0';
+	}
+
+	close(lock_fd);
+
+	switch (err) {
+	case EACCES:
+	case EAGAIN:
+		/* Lock contention, fail silently */
+		return '1';
+	default:
+		break;
+	}
+
+	/* Log an error for any other failure */
+	fprintf(stderr,
+		"%s: Failed to get lock on '%s' - (%s)\n",
+		progname,
+		file,
+		strerror(err));
+	return '3';
+}
+
+/*
+ * Wait and see if the parent exits
+ *
+ * State for the wait_for_parent request, which re-checks every 5
+ * seconds whether process "ppid" still exists.
+ */
+
+struct wait_for_parent_state {
+ /* Event context used to schedule the next check */
+ struct tevent_context *ev;
+ /* PID being watched */
+ pid_t ppid;
+};
+
+static void wait_for_parent_check(struct tevent_req *subreq);
+
+/*
+ * Start a request that completes once process "ppid" no longer exists.
+ *
+ * If ppid is already 1 the request completes immediately.  Otherwise
+ * the first existence check is scheduled 5 seconds from now.  Returns
+ * NULL only if the request itself cannot be allocated.
+ */
+static struct tevent_req *wait_for_parent_send(TALLOC_CTX *mem_ctx,
+					       struct tevent_context *ev,
+					       pid_t ppid)
+{
+	struct wait_for_parent_state *state;
+	struct tevent_req *request;
+	struct tevent_req *wakeup;
+
+	request = tevent_req_create(mem_ctx,
+				    &state,
+				    struct wait_for_parent_state);
+	if (request == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->ppid = ppid;
+
+	if (ppid == 1) {
+		fprintf(stderr, "parent == 1\n");
+		tevent_req_done(request);
+		return tevent_req_post(request, ev);
+	}
+
+	wakeup = tevent_wakeup_send(state,
+				    ev,
+				    tevent_timeval_current_ofs(5,0));
+	if (tevent_req_nomem(wakeup, request)) {
+		return tevent_req_post(request, ev);
+	}
+	tevent_req_set_callback(wakeup, wait_for_parent_check, request);
+
+	return request;
+}
+
+/*
+ * Wakeup callback: probe the watched PID with signal 0.  The request
+ * finishes when the probe reports ESRCH; otherwise another wakeup is
+ * scheduled 5 seconds ahead.
+ */
+static void wait_for_parent_check(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct wait_for_parent_state *state = tevent_req_data(
+		req, struct wait_for_parent_state);
+	struct tevent_req *next;
+	bool woke;
+	bool parent_gone;
+
+	woke = tevent_wakeup_recv(subreq);
+	TALLOC_FREE(subreq);
+	if (!woke) {
+		/* Ignore error */
+		fprintf(stderr, "%s: tevent_wakeup_recv() failed\n", progname);
+	}
+
+	parent_gone = (kill(state->ppid, 0) == -1) && (errno == ESRCH);
+	if (parent_gone) {
+		fprintf(stderr, "parent gone\n");
+		tevent_req_done(req);
+		return;
+	}
+
+	next = tevent_wakeup_send(state,
+				  state->ev,
+				  tevent_timeval_current_ofs(5,0));
+	if (tevent_req_nomem(next, req)) {
+		return;
+	}
+	tevent_req_set_callback(next, wait_for_parent_check, req);
+}
+
+/*
+ * Collect the result of a wait_for_parent request.  Returns false and
+ * stores a unix error in *perr if the request failed, true otherwise.
+ */
+static bool wait_for_parent_recv(struct tevent_req *req, int *perr)
+{
+	return !tevent_req_is_unix_error(req, perr);
+}
+
+/*
+ * Perform I/O on lock in a loop - complete when file removed or replaced
+ *
+ * State for the lock_io_check request.
+ */
+
+struct lock_io_check_state {
+ /* Event context used to schedule the next check */
+ struct tevent_context *ev;
+ /* Path of the lock file that is re-opened on every check */
+ const char *lock_file;
+ /* Inode the lock file is expected to have; a mismatch ends the request */
+ ino_t inode;
+ /* Seconds between checks */
+ unsigned long recheck_interval;
+};
+
+static void lock_io_check_loop(struct tevent_req *subreq);
+
+/*
+ * Start a request that periodically checks the lock file.
+ *
+ * The first check runs "recheck_interval" seconds from now.  The
+ * lock_file pointer is stored, not copied.  Returns NULL only if the
+ * request itself cannot be allocated.
+ */
+static struct tevent_req *lock_io_check_send(TALLOC_CTX *mem_ctx,
+					     struct tevent_context *ev,
+					     const char *lock_file,
+					     ino_t inode,
+					     unsigned long recheck_interval)
+{
+	struct lock_io_check_state *state;
+	struct tevent_req *request;
+	struct tevent_req *wakeup;
+	struct timeval first_check;
+
+	request = tevent_req_create(mem_ctx,
+				    &state,
+				    struct lock_io_check_state);
+	if (request == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->lock_file = lock_file;
+	state->inode = inode;
+	state->recheck_interval = recheck_interval;
+
+	first_check = tevent_timeval_current_ofs(state->recheck_interval, 0);
+	wakeup = tevent_wakeup_send(state, ev, first_check);
+	if (tevent_req_nomem(wakeup, request)) {
+		return tevent_req_post(request, ev);
+	}
+	tevent_req_set_callback(wakeup, lock_io_check_loop, request);
+
+	return request;
+}
+
+/*
+ * Wakeup callback performing one check of the lock file.
+ *
+ * The file is re-opened by path, its inode is compared with the one
+ * recorded in the state and a blocking lock is taken on the byte at
+ * offset 1.  If every step succeeds the descriptor is closed and the
+ * next check is scheduled.  Any failure finishes the request (with
+ * success status) after printing the reason to stderr.
+ */
+static void lock_io_check_loop(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct lock_io_check_state *state = tevent_req_data(
+ req, struct lock_io_check_state);
+ bool status;
+ struct stat sb;
+ int fd = -1;
+ int ret;
+
+ status = tevent_wakeup_recv(subreq);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ /* Ignore error */
+ fprintf(stderr, "%s: tevent_wakeup_recv() failed\n", progname);
+ }
+
+ /* No O_CREAT here: a missing file means the lock is lost */
+ fd = open(state->lock_file, O_RDWR);
+ if (fd == -1) {
+ fprintf(stderr,
+ "%s: "
+ "lock lost - lock file \"%s\" open failed (ret=%d)\n",
+ progname,
+ state->lock_file,
+ errno);
+ goto done;
+ }
+
+ ret = fstat(fd, &sb);
+ if (ret != 0) {
+ fprintf(stderr,
+ "%s: "
+ "lock lost - lock file \"%s\" check failed (ret=%d)\n",
+ progname,
+ state->lock_file,
+ errno);
+ goto done;
+ }
+
+ /* A different inode means the file was replaced */
+ if (sb.st_ino != state->inode) {
+ fprintf(stderr,
+ "%s: lock lost - lock file \"%s\" inode changed\n",
+ progname,
+ state->lock_file);
+ goto done;
+ }
+
+ /*
+ * Attempt to lock a 2nd byte range. Using a blocking lock
+ * encourages ping timeouts if the cluster filesystem is in a
+ * bad state. It also makes testing easier.
+ */
+ ret = fcntl_lock_fd(fd, true, 1);
+ if (ret != 0) {
+ fprintf(stderr,
+ "%s: "
+ "lock fail - lock file \"%s\" test lock error (%d)\n",
+ progname,
+ state->lock_file,
+ ret);
+ goto done;
+ }
+
+ /* Unlock occurs on close */
+ close(fd);
+
+ /* Everything looks fine: schedule the next check */
+ subreq = tevent_wakeup_send(
+ state,
+ state->ev,
+ tevent_timeval_current_ofs(state->recheck_interval, 0));
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, lock_io_check_loop, req);
+
+ return;
+
+done:
+ /* Failure path: release the descriptor if one was opened */
+ if (fd != -1) {
+ close(fd);
+ }
+ tevent_req_done(req);
+}
+
+/*
+ * Collect the result of a lock_io_check request.  Returns false and
+ * stores a unix error in *perr if the request failed, true otherwise.
+ */
+static bool lock_io_check_recv(struct tevent_req *req, int *perr)
+{
+	return !tevent_req_is_unix_error(req, perr);
+}
+
+/*
+ * State for the lock_test_child request.  It has no members; it
+ * serves as the talloc parent of the two subrequests.
+ */
+struct lock_test_child_state {
+};
+
+static void lock_test_child_ping_done(struct tevent_req *subreq);
+static void lock_test_child_io_check_done(struct tevent_req *subreq);
+
+/*
+ * Start the two activities run by the lock test child: a tmon ping
+ * request on "fd" and the periodic lock file check.  The request
+ * finishes as soon as either subrequest finishes.
+ *
+ * The ping interval passed to tmon_ping_send() is 1 when send_pings
+ * is true and 0 otherwise.
+ */
+static struct tevent_req *lock_test_child_send(TALLOC_CTX *mem_ctx,
+					       struct tevent_context *ev,
+					       const char *lock_file,
+					       int fd,
+					       ino_t inode,
+					       unsigned long recheck_interval,
+					       bool send_pings)
+{
+	struct lock_test_child_state *state;
+	struct tevent_req *request;
+	struct tevent_req *ping_req;
+	struct tevent_req *io_req;
+	unsigned int ping_interval = 0;
+
+	if (send_pings) {
+		ping_interval = 1;
+	}
+
+	request = tevent_req_create(mem_ctx,
+				    &state,
+				    struct lock_test_child_state);
+	if (request == NULL) {
+		return NULL;
+	}
+
+	ping_req = tmon_ping_send(state,
+				  ev,
+				  fd,
+				  TMON_FD_BOTH,
+				  0,
+				  ping_interval);
+	if (tevent_req_nomem(ping_req, request)) {
+		return tevent_req_post(request, ev);
+	}
+	tevent_req_set_callback(ping_req, lock_test_child_ping_done, request);
+
+	io_req = lock_io_check_send(state,
+				    ev,
+				    lock_file,
+				    inode,
+				    recheck_interval);
+	if (tevent_req_nomem(io_req, request)) {
+		return tevent_req_post(request, ev);
+	}
+	tevent_req_set_callback(io_req,
+				lock_test_child_io_check_done,
+				request);
+
+	return request;
+}
+
+/*
+ * Completion callback for the child's ping subrequest: propagate its
+ * error, if any, to the parent request, otherwise mark it done.
+ */
+static void lock_test_child_ping_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int err;
+	bool ok;
+
+	ok = tmon_ping_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+
+	if (ok) {
+		tevent_req_done(req);
+	} else {
+		tevent_req_error(req, err);
+	}
+}
+
+/*
+ * Completion callback for the child's lock file check subrequest:
+ * propagate its error, if any, to the parent request, otherwise mark
+ * it done.
+ */
+static void lock_test_child_io_check_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int err;
+	bool ok;
+
+	ok = lock_io_check_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+
+	if (ok) {
+		tevent_req_done(req);
+	} else {
+		tevent_req_error(req, err);
+	}
+}
+
+/*
+ * Collect the result of a lock_test_child request.  An EPIPE error is
+ * reported as success because the parent exiting is expected; *perr
+ * still holds EPIPE in that case.
+ */
+static bool lock_test_child_recv(struct tevent_req *req, int *perr)
+{
+	if (!tevent_req_is_unix_error(req, perr)) {
+		return true;
+	}
+
+	/* Parent exit is expected */
+	return (*perr == EPIPE);
+}
+
+/*
+ * Body of the forked lock test child.  Never returns.
+ *
+ * Records the inode behind lock_fd, closes lock_fd, then runs a
+ * lock_test_child request on a fresh event context until it
+ * completes.  Exits with status 0 if the request succeeded and 1 on
+ * any failure.
+ */
+static void lock_test_child(const char *lock_file,
+			    int lock_fd,
+			    int pipe_fd,
+			    unsigned long recheck_interval,
+			    bool send_pings)
+{
+	struct tevent_context *child_ev;
+	struct tevent_req *request;
+	struct stat st;
+	ino_t lock_inode;
+	int err = 0;
+
+	if (fstat(lock_fd, &st) != 0) {
+		fprintf(stderr,
+			"%s: lock lost - "
+			"lock file \"%s\" stat failed (ret=%d)\n",
+			progname,
+			lock_file,
+			errno);
+		_exit(1);
+	}
+	lock_inode = st.st_ino;
+	close(lock_fd);
+
+	child_ev = tevent_context_init(NULL);
+	if (child_ev == NULL) {
+		fprintf(stderr, "%s: tevent_context_init() failed\n", progname);
+		_exit(1);
+	}
+
+	request = lock_test_child_send(child_ev,
+				       child_ev,
+				       lock_file,
+				       pipe_fd,
+				       lock_inode,
+				       recheck_interval,
+				       send_pings);
+	if (request == NULL) {
+		fprintf(stderr,
+			"%s: lock_test_child_send() failed\n",
+			progname);
+		_exit(1);
+	}
+
+	tevent_req_poll(request, child_ev);
+
+	if (!lock_test_child_recv(request, &err)) {
+		fprintf(stderr,
+			"%s: lock_test_child_recv() failed (%d)\n",
+			progname,
+			err);
+		_exit(1);
+	}
+
+	_exit(0);
+}
+
+/* State for the lock_test request run in the lock-holding process */
+struct lock_test_state {
+ /* Caller's lock descriptor; closed and set to -1 when the test ends */
+ int *lock_fdp;
+ /* This process's end of the socket pair shared with the child */
+ int pipe_fd;
+ /* PID of the forked lock test child */
+ pid_t child_pid;
+};
+
+static void lock_test_ping_done(struct tevent_req *subreq);
+
+/*
+ * Fork a lock test child and start monitoring it.
+ *
+ * A socket pair connects the two processes.  The child runs
+ * lock_test_child() (which never returns) with pings enabled when
+ * ping_timeout is non-zero.  The parent runs a tmon ping request on
+ * its end of the socket pair, using ping_timeout as the timeout.
+ *
+ * fdp points at the caller's lock descriptor; it is remembered so it
+ * can be closed when the test ends.  Returns NULL only if the request
+ * itself cannot be allocated; socketpair() and fork() failures are
+ * reported through the request with their errno value.
+ */
+static struct tevent_req *lock_test_send(TALLOC_CTX *mem_ctx,
+					 struct tevent_context *ev,
+					 const char *lock_file,
+					 int *fdp,
+					 unsigned long recheck_interval,
+					 unsigned long ping_timeout)
+{
+	struct tevent_req *req, *subreq;
+	struct lock_test_state *state;
+	pid_t pid;
+	int sv[2];
+	int ret;
+
+	req = tevent_req_create(mem_ctx, &state, struct lock_test_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
+	if (ret != 0) {
+		/* Save errno before fprintf() has a chance to change it */
+		int err = errno;
+
+		fprintf(stderr,
+			"%s: socketpair() failed (errno=%d)\n",
+			progname,
+			err);
+		tevent_req_error(req, err);
+		return tevent_req_post(req, ev);
+	}
+
+	pid = fork();
+	if (pid == -1) {
+		int err = errno;
+
+		fprintf(stderr, "%s: fork() failed (errno=%d)\n", progname, err);
+		close(sv[0]);
+		close(sv[1]);
+		tevent_req_error(req, err);
+		return tevent_req_post(req, ev);
+	}
+	if (pid == 0) {
+		/* Child */
+		close(sv[0]);
+		/* The child creates its own event context */
+		TALLOC_FREE(ev);
+
+		lock_test_child(lock_file,
+				*fdp,
+				sv[1],
+				recheck_interval,
+				ping_timeout != 0);
+		/* Above does not return */
+	}
+
+	/* Parent */
+	close(sv[1]);
+
+	state->lock_fdp = fdp;
+	state->pipe_fd = sv[0];
+	state->child_pid = pid;
+
+	subreq = tmon_ping_send(state, ev, sv[0], TMON_FD_BOTH, ping_timeout, 0);
+	if (tevent_req_nomem(subreq, req)) {
+		close(sv[0]);
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, lock_test_ping_done, req);
+
+	return req;
+}
+
+/*
+ * Completion callback for the parent's ping request.
+ *
+ * Whatever the outcome, the lock is released, the socket is closed
+ * and the child is killed and reaped.  The request then always
+ * finishes successfully; ping errors are only reported on stderr.
+ */
+static void lock_test_ping_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct lock_test_state *state = tevent_req_data(
+		req, struct lock_test_state);
+	int child_status;
+	int err;
+	bool ok;
+
+	ok = tmon_ping_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+
+	/* Errors are reported but otherwise ignored */
+	if (!ok) {
+		if (err == EPIPE) {
+			/* Child exit, child already printed message */
+		} else if (err == ETIMEDOUT) {
+			fprintf(stderr,
+				"%s: ping timeout from lock test child\n",
+				progname);
+		} else {
+			fprintf(stderr,
+				"%s: tmon_ping_recv() failed (%d)\n",
+				progname,
+				err);
+		}
+	}
+
+	/*
+	 * The lock checking child has gone away or stopped pinging:
+	 * drop the lock, close our end of the socket pair, SIGKILL
+	 * the child and wait for it to exit.
+	 */
+	close(*state->lock_fdp);
+	*state->lock_fdp = -1;
+	close(state->pipe_fd);
+	kill(state->child_pid, SIGKILL);
+	waitpid(state->child_pid, &child_status, 0);
+
+	tevent_req_done(req);
+}
+
+/*
+ * Collect the result of a lock_test request.  Returns false and
+ * stores a unix error in *perr if the request failed, true otherwise.
+ */
+static bool lock_test_recv(struct tevent_req *req, int *perr)
+{
+	return !tevent_req_is_unix_error(req, perr);
+}
+
+/*
+ * Wait for a reason to exit, indicating that parent has exited or I/O
+ * on lock failed
+ *
+ * The state structure has no members; it serves as the talloc parent
+ * of the subrequests.
+ */
+
+struct wait_for_exit_state {
+};
+
+static void wait_for_exit_parent_done(struct tevent_req *subreq);
+static void wait_for_exit_lock_test_done(struct tevent_req *subreq);
+
+/*
+ * Start a request that finishes when there is a reason for the
+ * helper to exit: the parent-watching request has completed or, if
+ * recheck_interval is non-zero, the lock test has completed.
+ *
+ * With recheck_interval == 0 no lock test is started.
+ */
+static struct tevent_req *wait_for_exit_send(TALLOC_CTX *mem_ctx,
+					     struct tevent_context *ev,
+					     pid_t ppid,
+					     const char *lock_file,
+					     int *fdp,
+					     unsigned long recheck_interval,
+					     unsigned long ping_timeout)
+{
+	struct wait_for_exit_state *state;
+	struct tevent_req *request;
+	struct tevent_req *parent_req;
+	struct tevent_req *test_req;
+
+	request = tevent_req_create(mem_ctx,
+				    &state,
+				    struct wait_for_exit_state);
+	if (request == NULL) {
+		return NULL;
+	}
+
+	parent_req = wait_for_parent_send(state, ev, ppid);
+	if (tevent_req_nomem(parent_req, request)) {
+		return tevent_req_post(request, ev);
+	}
+	tevent_req_set_callback(parent_req,
+				wait_for_exit_parent_done,
+				request);
+
+	if (recheck_interval == 0) {
+		return request;
+	}
+
+	test_req = lock_test_send(state,
+				  ev,
+				  lock_file,
+				  fdp,
+				  recheck_interval,
+				  ping_timeout);
+	if (tevent_req_nomem(test_req, request)) {
+		return tevent_req_post(request, ev);
+	}
+	tevent_req_set_callback(test_req,
+				wait_for_exit_lock_test_done,
+				request);
+
+	return request;
+}
+
+/*
+ * Completion callback for the parent-watching subrequest.  A failure
+ * is logged but the enclosing request is finished successfully either
+ * way.
+ */
+static void wait_for_exit_parent_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int err;
+	bool ok;
+
+	ok = wait_for_parent_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (!ok) {
+		/* Ignore error */
+		fprintf(stderr,
+			"%s: "
+			"wait_for_parent_recv() failed (%d)\n",
+			progname,
+			err);
+	}
+
+	tevent_req_done(req);
+}
+
+/*
+ * Completion callback for the lock test subrequest.  A failure is
+ * logged but the enclosing request is finished successfully either
+ * way.
+ */
+static void wait_for_exit_lock_test_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int err;
+	bool ok;
+
+	ok = lock_test_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (!ok) {
+		/* Ignore error, fall through to done */
+		fprintf(stderr,
+			"%s: "
+			"lock_test_recv() failed (%d)\n",
+			progname,
+			err);
+	}
+
+	tevent_req_done(req);
+}
+
+/*
+ * Collect the result of a wait_for_exit request.  Returns false and
+ * stores a unix error in *perr if the request failed, true otherwise.
+ */
+static bool wait_for_exit_recv(struct tevent_req *req, int *perr)
+{
+	return !tevent_req_is_unix_error(req, perr);
+}
+
+/* Print a one-line usage summary to stderr */
+static void usage(void)
+{
+	static const char fmt[] =
+		"Usage: %s <file> [recheck_interval [ping_timeout]]\n";
+
+	fprintf(stderr, fmt, progname);
+}
+
+/*
+ * Entry point of the fcntl mutex helper.
+ *
+ * Usage: <file> [recheck_interval [ping_timeout]]
+ *
+ * Tries to lock <file>, writes a single status character to stdout
+ * ('0' means the lock is held) and, if the lock was taken, keeps
+ * running until wait_for_exit completes.  recheck_interval defaults
+ * to 5 and ping_timeout to 0.
+ *
+ * Exits with status 1 on usage or setup errors, 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+ struct tevent_context *ev;
+ char result;
+ int ppid;
+ const char *file = NULL;
+ unsigned long recheck_interval;
+ unsigned long ping_timeout;
+ int ret;
+ int fd = -1;
+ struct tevent_req *req;
+ bool status;
+
+ /* progname is needed by usage() and all later diagnostics */
+ strlcpy(progpath, argv[0], sizeof(progpath));
+ progname = basename(progpath);
+
+ if (argc < 2 || argc > 4) {
+ usage();
+ exit(1);
+ }
+
+ ev = tevent_context_init(NULL);
+ if (ev == NULL) {
+ fprintf(stderr, "locking: tevent_context_init() failed\n");
+ exit(1);
+ }
+
+ /* Remember the parent so its disappearance can be detected later */
+ ppid = getppid();
+
+ file = argv[1];
+
+ recheck_interval = 5;
+ ping_timeout = 0;
+ if (argc >= 3) {
+ recheck_interval = smb_strtoul(argv[2],
+ NULL,
+ 10,
+ &ret,
+ SMB_STR_STANDARD);
+ if (ret != 0) {
+ usage();
+ exit(1);
+ }
+ }
+ if (argc >= 4) {
+ ping_timeout = smb_strtoul(argv[3],
+ NULL,
+ 10,
+ &ret,
+ SMB_STR_STANDARD);
+ if (ret != 0) {
+ usage();
+ exit(1);
+ }
+ }
+
+ /* Report the outcome of the lock attempt as one character on stdout */
+ result = fcntl_lock(file, &fd);
+ sys_write(STDOUT_FILENO, &result, 1);
+
+ /* Nothing to hold on to unless the lock was taken */
+ if (result != '0') {
+ return 0;
+ }
+
+ /* fd is passed by address so the lock test can close and reset it */
+ req = wait_for_exit_send(ev,
+ ev,
+ ppid,
+ file,
+ &fd,
+ recheck_interval,
+ ping_timeout);
+ if (req == NULL) {
+ fprintf(stderr,
+ "%s: wait_for_exit_send() failed\n",
+ progname);
+ exit(1);
+ }
+
+ tevent_req_poll(req, ev);
+
+ status = wait_for_exit_recv(req, &ret);
+ if (! status) {
+ fprintf(stderr,
+ "%s: wait_for_exit_recv() failed (%d)\n",
+ progname,
+ ret);
+ }
+
+ /* fd is -1 here if the lock test already released the lock */
+ if (fd != -1) {
+ close(fd);
+ }
+
+ return 0;
+}
diff --git a/ctdb/server/ctdb_persistent.c b/ctdb/server/ctdb_persistent.c
new file mode 100644
index 0000000..2671744
--- /dev/null
+++ b/ctdb/server/ctdb_persistent.c
@@ -0,0 +1,397 @@
+/*
+ persistent store logic
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/time.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+
+#include "common/reqid.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/* Bookkeeping for one in-flight trans3_commit control */
+struct ctdb_persistent_state {
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db; /* used by trans3_commit */
+ struct ctdb_client *client; /* used by trans3_commit */
+ /* The control request that is answered once the commit finishes */
+ struct ctdb_req_control_old *c;
+ /* Error message from the most recent failed reply, if any */
+ const char *errormsg;
+ /* Number of update_record controls sent but not yet acknowledged */
+ uint32_t num_pending;
+ /* Status from the most recent failed reply */
+ int32_t status;
+ /* Counters of failed replies and of controls sent */
+ uint32_t num_failed, num_sent;
+};
+
+/*
+ 1) all nodes fail, and all nodes reply
+ 2) some nodes fail, all nodes reply
+ 3) some nodes timeout
+ 4) all nodes succeed
+ */
+
+/*
+  Reply handler for an update_record control sent by trans3_commit.
+
+  Replies arriving while recovery is active are ignored.  A failed
+  reply is recorded in the state and switches the daemon to recovery
+  mode.  The successful reply that brings num_pending to zero sends
+  the reply for the original control and frees the state.
+ */
+static void ctdb_persistent_callback(struct ctdb_context *ctdb,
+				     int32_t status, TDB_DATA data,
+				     const char *errormsg,
+				     void *private_data)
+{
+	struct ctdb_persistent_state *commit = talloc_get_type(
+		private_data, struct ctdb_persistent_state);
+
+	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+		DEBUG(DEBUG_INFO, ("ctdb_persistent_callback: ignoring reply "
+				   "during recovery\n"));
+		return;
+	}
+
+	if (status == 0) {
+		commit->num_pending--;
+		if (commit->num_pending == 0) {
+			ctdb_request_control_reply(commit->ctdb,
+						   commit->c,
+						   NULL,
+						   0,
+						   commit->errormsg);
+			talloc_free(commit);
+		}
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,("ctdb_persistent_callback failed with status %d (%s)\n",
+			 status, errormsg?errormsg:"no error message given"));
+	commit->status = status;
+	commit->errormsg = errormsg;
+	commit->num_failed++;
+
+	/*
+	 * A node that cannot complete update_record means that either
+	 * a recovery is already under way or something is badly
+	 * wrong.  Trigger a recovery and leave it to the recovery to
+	 * finish the transaction and send the trans3_commit reply to
+	 * the client.
+	 */
+	ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+}
+
+/*
+  Timer handler fired when a trans3_commit has waited too long for
+  its replies.  Outside of recovery it answers the pending control
+  with status 1 and frees the state; during recovery it does nothing.
+ */
+static void ctdb_persistent_store_timeout(struct tevent_context *ev,
+					  struct tevent_timer *te,
+					  struct timeval t, void *private_data)
+{
+	struct ctdb_persistent_state *commit = talloc_get_type(
+		private_data, struct ctdb_persistent_state);
+	struct ctdb_context *ctdb = commit->ctdb;
+
+	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+		DEBUG(DEBUG_INFO, ("ctdb_persistent_store_timeout: ignoring "
+				   "timeout during recovery\n"));
+		return;
+	}
+
+	ctdb_request_control_reply(ctdb, commit->c, NULL, 1,
+				   "timeout in ctdb_persistent_state");
+
+	talloc_free(commit);
+}
+
+/**
+ * Answer every trans3 commit control that is still pending.
+ *
+ * Called by the end-recovery control to clean up after a recovery
+ * that interrupted a transaction.  Each pending commit is answered
+ * with status 2 and its state is freed.  Nothing is done while
+ * recovery is still active.
+ */
+void ctdb_persistent_finish_trans3_commits(struct ctdb_context *ctdb)
+{
+	struct ctdb_db_context *db;
+
+	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+		DEBUG(DEBUG_INFO, ("ctdb_persistent_finish_trans3_commits: "
+				   "skipping execution when recovery is "
+				   "active\n"));
+		return;
+	}
+
+	for (db = ctdb->db_list; db; db = db->next) {
+		struct ctdb_persistent_state *pending = db->persistent_state;
+
+		if (pending != NULL) {
+			ctdb_request_control_reply(
+				ctdb, pending->c, NULL, 2,
+				"trans3 commit ended by recovery");
+
+			/* The destructor resets db->persistent_state */
+			talloc_free(pending);
+		}
+	}
+}
+
+/*
+ * Talloc destructor for a commit state: clears the client's db_id and
+ * the database's persistent_state pointer, where set.  Always returns
+ * 0.
+ */
+static int ctdb_persistent_state_destructor(struct ctdb_persistent_state *state)
+{
+	struct ctdb_client *owner = state->client;
+	struct ctdb_db_context *db = state->ctdb_db;
+
+	if (owner != NULL) {
+		owner->db_id = 0;
+	}
+
+	if (db != NULL) {
+		db->persistent_state = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Store a set of persistent records.
+ * This is used to roll out a transaction to all nodes.
+ *
+ * recdata is interpreted as a struct ctdb_marshall_buffer and is
+ * forwarded unchanged in a CTDB_CONTROL_UPDATE_RECORD control to
+ * every node in the VNN map that is not flagged inactive.
+ *
+ * Returns -1 if recovery is active, the client or database cannot be
+ * found, a commit is already in progress for the client or database,
+ * or a control cannot be sent.  Returns 0 otherwise.  When at least
+ * one control was sent, *async_reply is set to true and the reply is
+ * sent later by the callback or the timeout handler.
+ */
+int32_t ctdb_control_trans3_commit(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA recdata, bool *async_reply)
+{
+ struct ctdb_client *client;
+ struct ctdb_persistent_state *state;
+ unsigned int i;
+ struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;
+ struct ctdb_db_context *ctdb_db;
+
+ if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+ DEBUG(DEBUG_INFO,("rejecting ctdb_control_trans3_commit when recovery active\n"));
+ return -1;
+ }
+
+ client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client);
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " can not match persistent_store "
+ "to a client. Returning error\n"));
+ return -1;
+ }
+
+ /* A non-zero db_id means this client already has a commit in flight */
+ if (client->db_id != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ERROR: trans3_commit: "
+ "client-db_id[0x%08x] != 0 "
+ "(client_id[0x%08x]): trans3_commit active?\n",
+ client->db_id, client->client_id));
+ return -1;
+ }
+
+ ctdb_db = find_ctdb_db(ctdb, m->db_id);
+ if (ctdb_db == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_control_trans3_commit: "
+ "Unknown database db_id[0x%08x]\n", m->db_id));
+ return -1;
+ }
+
+ /* Only one commit per database at a time */
+ if (ctdb_db->persistent_state != NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Error: "
+ "ctdb_control_trans3_commit "
+ "called while a transaction commit is "
+ "active. db_id[0x%08x]\n", m->db_id));
+ return -1;
+ }
+
+ ctdb_db->persistent_state = talloc_zero(ctdb_db,
+ struct ctdb_persistent_state);
+ CTDB_NO_MEMORY(ctdb, ctdb_db->persistent_state);
+
+ client->db_id = m->db_id;
+
+ state = ctdb_db->persistent_state;
+ state->ctdb = ctdb;
+ state->ctdb_db = ctdb_db;
+ state->c = c;
+ state->client = client;
+
+ /* From here on, freeing state also clears client->db_id and
+ * ctdb_db->persistent_state via the destructor */
+ talloc_set_destructor(state, ctdb_persistent_state_destructor);
+
+ for (i = 0; i < ctdb->vnn_map->size; i++) {
+ struct ctdb_node *node = ctdb->nodes[ctdb->vnn_map->map[i]];
+ int ret;
+
+ /* only send to active nodes */
+ if (node->flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+
+ ret = ctdb_daemon_send_control(ctdb, node->pnn, 0,
+ CTDB_CONTROL_UPDATE_RECORD,
+ c->client_id, 0, recdata,
+ ctdb_persistent_callback,
+ state);
+ if (ret == -1) {
+ DEBUG(DEBUG_ERR,("Unable to send "
+ "CTDB_CONTROL_UPDATE_RECORD "
+ "to pnn %u\n", node->pnn));
+ talloc_free(state);
+ return -1;
+ }
+
+ state->num_pending++;
+ state->num_sent++;
+ }
+
+ /* Nothing was sent, so there is nothing to wait for */
+ if (state->num_pending == 0) {
+ talloc_free(state);
+ return 0;
+ }
+
+ /* we need to wait for the replies */
+ *async_reply = true;
+
+ /* need to keep the control structure around */
+ talloc_steal(state, c);
+
+ /* but we won't wait forever */
+ tevent_add_timer(ctdb->ev, state,
+ timeval_current_ofs(ctdb->tunable.control_timeout, 0),
+ ctdb_persistent_store_timeout, state);
+
+ return 0;
+}
+
+
+/*
+  backwards compatibility:
+
+  Note that a client has started a persistent store operation by
+  incrementing its num_persistent_updates counter.  The recdata
+  passed by the client is not used.
+
+  Returns 0 on success, -1 if the client id is unknown.
+ */
+int32_t ctdb_control_start_persistent_update(struct ctdb_context *ctdb,
+					     struct ctdb_req_control_old *c,
+					     TDB_DATA recdata)
+{
+	struct ctdb_client *client;
+
+	client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client);
+	if (client != NULL) {
+		client->num_persistent_updates++;
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " can not match start_persistent_update to a client. Returning error\n"));
+	return -1;
+}
+
+/*
+  backwards compatibility:
+
+  Note that a client is no longer doing a persistent update by
+  decrementing its num_persistent_updates counter, which never goes
+  below zero.
+
+  Returns 0 on success, -1 if the client id is unknown.
+*/
+int32_t ctdb_control_cancel_persistent_update(struct ctdb_context *ctdb,
+					      struct ctdb_req_control_old *c,
+					      TDB_DATA recdata)
+{
+	struct ctdb_client *client;
+
+	client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client);
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " can not match cancel_persistent_update to a client. Returning error\n"));
+		return -1;
+	}
+
+	if (client->num_persistent_updates > 0) {
+		client->num_persistent_updates -= 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the sequence number record (key CTDB_DB_SEQNUM_KEY) of the
+ * database with the given id from the local tdb.
+ *
+ * On success returns 0 and stores the value in *seqnum; a record
+ * whose size is not that of a uint64_t yields 0.  Returns -1 if
+ * memory cannot be allocated, the database is unknown or access to
+ * it is not allowed, or the non-zero result of ctdb_ltdb_fetch().
+ * *seqnum is not written on failure.
+ */
+static int32_t ctdb_get_db_seqnum(struct ctdb_context *ctdb,
+				  uint32_t db_id,
+				  uint64_t *seqnum)
+{
+	int32_t ret;
+	struct ctdb_db_context *ctdb_db;
+	const char *keyname = CTDB_DB_SEQNUM_KEY;
+	TDB_DATA key;
+	TDB_DATA data;
+	TALLOC_CTX *mem_ctx;
+	struct ctdb_ltdb_header header;
+
+	/*
+	 * Without a valid temporary context the fetched record would
+	 * be allocated on the NULL context and never freed.
+	 */
+	mem_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, mem_ctx);
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id));
+		ret = -1;
+		goto done;
+	}
+
+	if (! ctdb_db_allow_access(ctdb_db)) {
+		ret = -1;
+		goto done;
+	}
+
+	/* The key includes the terminating NUL */
+	key.dptr = (uint8_t *)discard_const(keyname);
+	key.dsize = strlen(keyname) + 1;
+
+	ret = (int32_t)ctdb_ltdb_fetch(ctdb_db, key, &header, mem_ctx, &data);
+	if (ret != 0) {
+		goto done;
+	}
+
+	if (data.dsize != sizeof(uint64_t)) {
+		*seqnum = 0;
+		goto done;
+	}
+
+	/* Copy rather than dereference a cast pointer into a byte buffer */
+	memcpy(seqnum, data.dptr, sizeof(*seqnum));
+
+done:
+	talloc_free(mem_ctx);
+	return ret;
+}
+
+/**
+ * Control handler returning the sequence number of a persistent
+ * database.
+ *
+ * The database id is read as a uint32_t from the start of indata.
+ * On success outdata holds a talloc'ed copy of the 64-bit sequence
+ * number.  Returns 0 on success, the lookup's error code if the
+ * lookup fails and -1 if the copy cannot be allocated.
+ */
+int32_t ctdb_control_get_db_seqnum(struct ctdb_context *ctdb,
+				   TDB_DATA indata,
+				   TDB_DATA *outdata)
+{
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	uint64_t seqnum;
+	int32_t ret;
+
+	ret = ctdb_get_db_seqnum(ctdb, db_id, &seqnum);
+	if (ret != 0) {
+		return ret;
+	}
+
+	outdata->dsize = sizeof(uint64_t);
+	outdata->dptr = talloc_memdup(outdata, &seqnum, sizeof(uint64_t));
+
+	return (outdata->dptr == NULL) ? -1 : ret;
+}
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
new file mode 100644
index 0000000..004ddb3
--- /dev/null
+++ b/ctdb/server/ctdb_recover.c
@@ -0,0 +1,1243 @@
+/*
+ ctdb recovery code
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "ctdb_cluster_mutex.h"
+
+/*
+ * Return the current VNN map (generation, size and the map array) as a
+ * struct ctdb_vnn_map_wire allocated on outdata.  The control takes no
+ * input data.  Returns 0 on success; an allocation failure is handled by
+ * CTDB_NO_MEMORY().
+ */
+int
+ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+	struct ctdb_vnn_map_wire *map;
+	size_t map_bytes;
+	size_t len;
+
+	CHECK_CONTROL_DATA_SIZE(0);
+
+	map_bytes = sizeof(uint32_t)*ctdb->vnn_map->size;
+	len = offsetof(struct ctdb_vnn_map_wire, map) + map_bytes;
+
+	map = talloc_size(outdata, len);
+	CTDB_NO_MEMORY(ctdb, map);
+
+	map->size = ctdb->vnn_map->size;
+	map->generation = ctdb->vnn_map->generation;
+	memcpy(map->map, ctdb->vnn_map->map, map_bytes);
+
+	outdata->dptr = (uint8_t *)map;
+	outdata->dsize = len;
+
+	return 0;
+}
+
+/*
+ * Replace the VNN map with the struct ctdb_vnn_map_wire carried in indata.
+ * Only permitted while the recovery mode is CTDB_RECOVERY_ACTIVE.
+ *
+ * The replacement is built completely before the current map is freed, so
+ * a failed allocation leaves ctdb->vnn_map untouched instead of NULL or
+ * half-initialised.  indata must be large enough for the wire header and
+ * for the number of entries that header announces.
+ *
+ * Returns 0 on success, -1 on wrong recovery mode, short input or
+ * allocation failure.  opcode and outdata are not used.
+ */
+int
+ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+	struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
+	struct ctdb_vnn_map *new_map;
+	size_t hdr_len = offsetof(struct ctdb_vnn_map_wire, map);
+
+	if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+		DEBUG(DEBUG_ERR, ("Attempt to set vnnmap when not in recovery\n"));
+		return -1;
+	}
+
+	/* Do not trust map->size before the buffer length has been checked */
+	if (indata.dsize < hdr_len ||
+	    (indata.dsize - hdr_len) / sizeof(uint32_t) < map->size) {
+		DEBUG(DEBUG_ERR, (__location__ " Invalid data size in setvnnmap\n"));
+		return -1;
+	}
+
+	new_map = talloc(ctdb, struct ctdb_vnn_map);
+	CTDB_NO_MEMORY(ctdb, new_map);
+
+	new_map->generation = map->generation;
+	new_map->size = map->size;
+	new_map->map = talloc_array(new_map, uint32_t, map->size);
+	if (new_map->map == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Out of memory for vnn map array\n"));
+		talloc_free(new_map);
+		return -1;
+	}
+
+	memcpy(new_map->map, map->map, sizeof(uint32_t)*map->size);
+
+	/* Everything succeeded: swap the maps */
+	talloc_free(ctdb->vnn_map);
+	ctdb->vnn_map = new_map;
+
+	return 0;
+}
+
+/*
+ * Return the id and flags of every database on ctdb->db_list as a
+ * zero-initialised struct ctdb_dbid_map_old allocated on outdata.  The
+ * control takes no input data.  Returns 0; if the reply cannot be
+ * allocated the process exits with status 1.
+ */
+int
+ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+	struct ctdb_dbid_map_old *dbid_map;
+	struct ctdb_db_context *db;
+	uint32_t num_dbs = 0;
+	uint32_t idx = 0;
+
+	CHECK_CONTROL_DATA_SIZE(0);
+
+	/* First pass: count the databases to size the reply */
+	for (db = ctdb->db_list; db != NULL; db = db->next) {
+		num_dbs++;
+	}
+
+	outdata->dsize = offsetof(struct ctdb_dbid_map_old, dbs) + sizeof(dbid_map->dbs[0])*num_dbs;
+	outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
+	if (outdata->dptr == NULL) {
+		DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
+		exit(1);
+	}
+
+	dbid_map = (struct ctdb_dbid_map_old *)outdata->dptr;
+	dbid_map->num = num_dbs;
+
+	/* Second pass: fill in one entry per database */
+	db = ctdb->db_list;
+	while (db != NULL) {
+		dbid_map->dbs[idx].db_id = db->db_id;
+		dbid_map->dbs[idx].flags = db->db_flags;
+		idx++;
+		db = db->next;
+	}
+
+	return 0;
+}
+
+/*
+ * Return the node list, converted by ctdb_node_list_to_map(), in outdata.
+ * The reply size is the talloc size of the converted map.  The control
+ * takes no input data.  Returns 0 on success, -1 if the conversion
+ * returns NULL.
+ */
+int
+ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+	CHECK_CONTROL_DATA_SIZE(0);
+
+	outdata->dptr = (unsigned char *)ctdb_node_list_to_map(ctdb->nodes,
+							       ctdb->num_nodes,
+							       outdata);
+	if (!outdata->dptr) {
+		return -1;
+	}
+
+	/* The map is a talloc chunk, so its size is the reply length */
+	outdata->dsize = talloc_get_size(outdata->dptr);
+	return 0;
+}
+
+/*
+ reload the nodes file
+
+ The current node array is moved onto a temporary talloc context and
+ ctdb->nodes/num_nodes are reset before ctdb_load_nodes_file() is called.
+ The loaded list is then walked slot by slot: a slot whose address matches
+ the old node at the same index gets the old node object back; any other
+ slot not flagged NODE_FLAGS_DELETED is passed to methods->add_node() and
+ methods->connect_node(), with ctdb_fatal() called if either fails.
+ Finally CTDB_SRVID_RELOAD_NODES is sent to this node's own pnn and the
+ temporary context is freed.  Always returns 0; opcode is not used.
+*/
+int
+ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
+{
+ unsigned int i, num_nodes;
+ TALLOC_CTX *tmp_ctx;
+ struct ctdb_node **nodes;
+
+ tmp_ctx = talloc_new(ctdb);
+
+ /* park the old node array on tmp_ctx while the new one is loaded */
+ talloc_steal(tmp_ctx, ctdb->nodes);
+ nodes = ctdb->nodes;
+ ctdb->nodes = NULL;
+ num_nodes = ctdb->num_nodes;
+ ctdb->num_nodes = 0;
+
+ /* load the new nodes file */
+ ctdb_load_nodes_file(ctdb);
+
+ for (i=0; i<ctdb->num_nodes; i++) {
+ /* keep any identical pre-existing nodes and connections:
+ * the freshly loaded entry is freed and the old node object
+ * is moved under the new array in its place */
+ if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
+ talloc_free(ctdb->nodes[i]);
+ ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
+ continue;
+ }
+
+ if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+ continue;
+ }
+
+ /* any new or different nodes must be added */
+ if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
+ DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
+ ctdb_fatal(ctdb, "failed to add node. shutting down\n");
+ }
+ if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
+ DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
+ ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
+ }
+ }
+
+ /* tell the recovery daemon to reload the nodes file too */
+ ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
+
+ talloc_free(tmp_ctx); /* frees the old array parked above; presumably also any old nodes that were not reused - confirm parentage */
+
+ return 0;
+}
+
+struct db_pull_state { /* traversal state shared by ctdb_control_db_pull() and traverse_db_pull() */
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db; /* database being traversed */
+ struct ctdb_marshall_buffer *recs; /* records marshalled but not yet sent; NULL when empty */
+ uint32_t pnn; /* node the record messages are sent to */
+ uint64_t srvid; /* srvid the record messages are sent with */
+ uint32_t num_records; /* running total of records sent */
+};
+
+/*
+ * tdb traverse callback for ctdb_control_db_pull(): append the record to
+ * the pending marshall buffer and, once that buffer has reached the
+ * rec_buffer_size_limit tunable, send it to the requesting node and start
+ * a new one.  Returns 0 to continue the traverse, -1 (with the pending
+ * buffer freed) if marshalling or sending fails.
+ */
+static int traverse_db_pull(struct tdb_context *tdb, TDB_DATA key,
+			    TDB_DATA data, void *private_data)
+{
+	struct db_pull_state *pull = (struct db_pull_state *)private_data;
+	struct ctdb_marshall_buffer *grown;
+	TDB_DATA blob;
+
+	grown = ctdb_marshall_add(pull->ctdb, pull->recs,
+				  pull->ctdb_db->db_id, 0, key, NULL, data);
+	if (grown == NULL) {
+		TALLOC_FREE(pull->recs);
+		return -1;
+	}
+	pull->recs = grown;
+
+	/* Keep accumulating until the buffer is big enough to ship */
+	if (talloc_get_size(pull->recs) <
+	    pull->ctdb->tunable.rec_buffer_size_limit) {
+		return 0;
+	}
+
+	blob = ctdb_marshall_finish(pull->recs);
+	if (ctdb_daemon_send_message(pull->ctdb, pull->pnn,
+				     pull->srvid, blob) != 0) {
+		TALLOC_FREE(pull->recs);
+		return -1;
+	}
+
+	pull->num_records += pull->recs->count;
+	TALLOC_FREE(pull->recs);
+
+	return 0;
+}
+
+/*
+ * Send every record of a frozen database to the node that issued the
+ * control.  indata is a struct ctdb_pulldb_ext naming the database and the
+ * srvid to send the record messages with.  Records are shipped in
+ * marshalled batches by traverse_db_pull(); whatever is still pending
+ * after the traverse is sent here.
+ *
+ * On success outdata holds the uint32_t number of records sent and 0 is
+ * returned.  A database flagged invalid_records is not traversed and
+ * reports zero records.  Returns -1 if the database is unknown or not
+ * frozen, the database lock cannot be marked, the traverse or the final
+ * send fails, or the reply cannot be allocated.
+ */
+int32_t ctdb_control_db_pull(struct ctdb_context *ctdb,
+			     struct ctdb_req_control_old *c,
+			     TDB_DATA indata, TDB_DATA *outdata)
+{
+	struct ctdb_pulldb_ext *pulldb_ext;
+	struct ctdb_db_context *ctdb_db;
+	struct db_pull_state state;
+	int ret;
+
+	pulldb_ext = (struct ctdb_pulldb_ext *)indata.dptr;
+
+	ctdb_db = find_ctdb_db(ctdb, pulldb_ext->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n",
+				 pulldb_ext->db_id));
+		return -1;
+	}
+
+	if (!ctdb_db_frozen(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("rejecting ctdb_control_pull_db when not frozen\n"));
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/* this is just a warning, as the tdb should be empty anyway */
+		DEBUG(DEBUG_WARNING,
+		      ("db(%s) unhealty in ctdb_control_db_pull: %s\n",
+		       ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
+	state.ctdb = ctdb;
+	state.ctdb_db = ctdb_db;
+	state.recs = NULL;
+	state.pnn = c->hdr.srcnode;
+	state.srvid = pulldb_ext->srvid;
+	state.num_records = 0;
+
+	/* If the records are invalid, we are done */
+	if (ctdb_db->invalid_records) {
+		goto done;
+	}
+
+	if (ctdb_lockdb_mark(ctdb_db) != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to get lock on entire db - failing\n"));
+		return -1;
+	}
+
+	ret = tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_db_pull, &state);
+	if (ret == -1) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to get traverse db '%s'\n",
+		       ctdb_db->db_name));
+		/*
+		 * The traverse can fail without the callback having
+		 * failed, in which case a partially filled buffer is
+		 * still hanging off the ctdb context - release it.
+		 */
+		TALLOC_FREE(state.recs);
+		ctdb_lockdb_unmark(ctdb_db);
+		return -1;
+	}
+
+	/* Last few records */
+	if (state.recs != NULL) {
+		TDB_DATA buffer;
+
+		buffer = ctdb_marshall_finish(state.recs);
+		ret = ctdb_daemon_send_message(state.ctdb, state.pnn,
+					       state.srvid, buffer);
+		if (ret != 0) {
+			TALLOC_FREE(state.recs);
+			ctdb_lockdb_unmark(ctdb_db);
+			return -1;
+		}
+
+		state.num_records += state.recs->count;
+		TALLOC_FREE(state.recs);
+	}
+
+	ctdb_lockdb_unmark(ctdb_db);
+
+done:
+	outdata->dptr = talloc_size(outdata, sizeof(uint32_t));
+	if (outdata->dptr == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
+		return -1;
+	}
+
+	memcpy(outdata->dptr, (uint8_t *)&state.num_records, sizeof(uint32_t));
+	outdata->dsize = sizeof(uint32_t);
+
+	return 0;
+}
+
+struct db_push_state { /* per-database state of a push started by ctdb_control_db_push_start() */
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db; /* database the pushed records are stored into */
+ uint64_t srvid; /* srvid db_push_msg_handler() is registered for */
+ uint32_t num_records; /* records stored so far; reported by ctdb_control_db_push_confirm() */
+ bool failed; /* set once a message could not be applied; later messages are ignored */
+};
+
+/*
+ * Message handler registered by ctdb_control_db_push_start().  indata is a
+ * marshalled record buffer; every record in it is stored into the database
+ * of the push state passed as private_data, after stripping the read-only
+ * flags from its ltdb header.
+ *
+ * On any error the push state is marked failed and all further messages
+ * for this push are ignored.  On success the record count of the buffer is
+ * added to state->num_records.
+ */
+static void db_push_msg_handler(uint64_t srvid, TDB_DATA indata,
+				void *private_data)
+{
+	struct db_push_state *state = talloc_get_type(
+		private_data, struct db_push_state);
+	struct ctdb_marshall_buffer *recs;
+	struct ctdb_rec_data_old *rec;
+	unsigned int i;
+	int ret;
+
+	if (state->failed) {
+		return;
+	}
+
+	/*
+	 * The buffer header must be complete before count and db_id are
+	 * read from it (same check as ctdb_control_try_delete_records()).
+	 */
+	if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " invalid data in db push message\n"));
+		goto failed;
+	}
+
+	recs = (struct ctdb_marshall_buffer *)indata.dptr;
+	rec = (struct ctdb_rec_data_old *)&recs->data[0];
+
+	DEBUG(DEBUG_INFO, ("starting push of %u records for dbid 0x%x\n",
+			   recs->count, recs->db_id));
+
+	for (i=0; i<recs->count; i++) {
+		TDB_DATA key, data;
+		struct ctdb_ltdb_header *hdr;
+
+		/* each record is laid out as key bytes followed by data bytes */
+		key.dptr = &rec->data[0];
+		key.dsize = rec->keylen;
+		data.dptr = &rec->data[key.dsize];
+		data.dsize = rec->datalen;
+
+		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
+			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+			goto failed;
+		}
+
+		hdr = (struct ctdb_ltdb_header *)data.dptr;
+		/* Strip off any read only record flags.
+		 * All readonly records are revoked implicitly by a recovery.
+		 */
+		hdr->flags &= ~CTDB_REC_RO_FLAGS;
+
+		/* the payload follows the ltdb header */
+		data.dptr += sizeof(*hdr);
+		data.dsize -= sizeof(*hdr);
+
+		ret = ctdb_ltdb_store(state->ctdb_db, key, hdr, data);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ " Unable to store record\n"));
+			goto failed;
+		}
+
+		/* rec->length is the distance to the next record */
+		rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
+	}
+
+	DEBUG(DEBUG_DEBUG, ("finished push of %u records for dbid 0x%x\n",
+			    recs->count, recs->db_id));
+
+	state->num_records += recs->count;
+	return;
+
+failed:
+	state->failed = true;
+}
+
+/*
+ * Prepare a frozen database to receive pushed records.  indata is a
+ * struct ctdb_pulldb_ext naming the database and the srvid the record
+ * messages will arrive on.  A push state is allocated on the database,
+ * db_push_msg_handler() is registered for the srvid and the database lock
+ * is marked.  If a push was already in progress its old state is
+ * deregistered and freed first.
+ *
+ * Returns 0 on success, -1 if the database is unknown or not frozen, or if
+ * allocation, handler registration or lock marking fails.
+ */
+int32_t ctdb_control_db_push_start(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_pulldb_ext *req = (struct ctdb_pulldb_ext *)indata.dptr;
+	struct ctdb_db_context *ctdb_db = find_ctdb_db(ctdb, req->db_id);
+	struct db_push_state *push;
+
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Unknown db 0x%08x\n", req->db_id));
+		return -1;
+	}
+
+	if (!ctdb_db_frozen(ctdb_db)) {
+		DEBUG(DEBUG_ERR,
+		      ("rejecting ctdb_control_db_push_start when not frozen\n"));
+		return -1;
+	}
+
+	if (ctdb_db->push_started) {
+		struct db_push_state *stale;
+
+		DEBUG(DEBUG_WARNING,
+		      (__location__ " DB push already started for %s\n",
+		       ctdb_db->db_name));
+
+		/* Drop the handler and state left over from the earlier push */
+		stale = (struct db_push_state *)ctdb_db->push_state;
+		if (stale != NULL) {
+			srvid_deregister(ctdb->srv, stale->srvid, stale);
+			talloc_free(stale);
+			ctdb_db->push_state = NULL;
+		}
+	}
+
+	push = talloc_zero(ctdb_db, struct db_push_state);
+	if (push == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
+		return -1;
+	}
+
+	push->ctdb = ctdb;
+	push->ctdb_db = ctdb_db;
+	push->srvid = req->srvid;
+	push->failed = false;
+
+	if (srvid_register(ctdb->srv, push, push->srvid,
+			   db_push_msg_handler, push) != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to register srvid for db push\n"));
+		talloc_free(push);
+		return -1;
+	}
+
+	if (ctdb_lockdb_mark(ctdb_db) != 0) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to get lock on entire db - failing\n"));
+		srvid_deregister(ctdb->srv, push->srvid, push);
+		talloc_free(push);
+		return -1;
+	}
+
+	ctdb_db->push_state = push;
+	ctdb_db->push_started = true;
+
+	return 0;
+}
+
+int32_t ctdb_control_db_push_confirm(struct ctdb_context *ctdb,
+ TDB_DATA indata, TDB_DATA *outdata)
+{
+ uint32_t db_id;
+ struct ctdb_db_context *ctdb_db;
+ struct db_push_state *state;
+ /*
+  * Finish a database push.  indata holds the uint32_t database id.
+  * The database lock is unmarked, the push message handler is
+  * deregistered and the number of records counted in the push state
+  * is returned as a uint32_t in outdata.  Returns -1 if the database
+  * is unknown, not frozen, has no push in progress or no push state,
+  * or if the reply cannot be allocated.
+  */
+ db_id = *(uint32_t *)indata.dptr;
+
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id));
+ return -1;
+ }
+
+ if (!ctdb_db_frozen(ctdb_db)) {
+ DEBUG(DEBUG_ERR,
+ ("rejecting ctdb_control_db_push_confirm when not frozen\n"));
+ return -1;
+ }
+
+ if (!ctdb_db->push_started) {
+ DEBUG(DEBUG_ERR, (__location__ " DB push not started\n"));
+ return -1;
+ }
+ /*
+  * When ctdb_db_readonly() reports true, wipe the tracking database
+  * (rottdb); if the wipe fails, close it and reset read-only support.
+  */
+ if (ctdb_db_readonly(ctdb_db)) {
+ DEBUG(DEBUG_ERR,
+ ("Clearing the tracking database for dbid 0x%x\n",
+ ctdb_db->db_id));
+ if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
+ DEBUG(DEBUG_ERR,
+ ("Failed to wipe tracking database for 0x%x."
+ " Dropping read-only delegation support\n",
+ ctdb_db->db_id));
+ tdb_close(ctdb_db->rottdb);
+ ctdb_db->rottdb = NULL;
+ ctdb_db_reset_readonly(ctdb_db);
+ }
+ /*
+  * NOTE(review): this loop only terminates if freeing an entry
+  * also unlinks it from revokechild_active (presumably via a
+  * talloc destructor) - confirm.
+  */
+ while (ctdb_db->revokechild_active != NULL) {
+ talloc_free(ctdb_db->revokechild_active);
+ }
+ }
+
+ ctdb_lockdb_unmark(ctdb_db);
+
+ state = (struct db_push_state *)ctdb_db->push_state;
+ if (state == NULL) { /* the lock is already unmarked and push_started stays set on this path */
+ DEBUG(DEBUG_ERR, (__location__ " Missing push db state\n"));
+ return -1;
+ }
+
+ srvid_deregister(ctdb->srv, state->srvid, state);
+
+ outdata->dptr = talloc_size(outdata, sizeof(uint32_t));
+ if (outdata->dptr == NULL) { /* push_started is left set here as well */
+ DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
+ talloc_free(state);
+ ctdb_db->push_state = NULL;
+ return -1;
+ }
+
+ memcpy(outdata->dptr, (uint8_t *)&state->num_records, sizeof(uint32_t));
+ outdata->dsize = sizeof(uint32_t);
+
+ talloc_free(state);
+ ctdb_db->push_started = false;
+ ctdb_db->push_state = NULL;
+
+ return 0;
+}
+
+struct set_recmode_state { /* carried from ctdb_control_set_recmode() to set_recmode_handler() */
+ struct ctdb_context *ctdb;
+ struct ctdb_req_control_old *c; /* control request the handler replies to */
+};
+
+static void set_recmode_handler(char status,
+ double latency,
+ void *private_data)
+{
+ struct set_recmode_state *state = talloc_get_type_abort(
+ private_data, struct set_recmode_state);
+ int s = 0;
+ const char *err = NULL;
+ /*
+  * Result handler for the cluster mutex attempt started by
+  * ctdb_control_set_recmode().  status is a character code: '0' the
+  * lock was taken, '1' contention, '2' timeout, anything else an
+  * unexpected error.  The outcome is turned into the reply status
+  * "s" and optional error string "err" for the pending control, and
+  * the state is freed once the reply has been sent.
+  */
+ switch (status) {
+ case '0':
+ /* Mutex taken: reported as a failure, and the recovery mode is
+ * left unchanged */
+ DEBUG(DEBUG_ERR,
+ ("ERROR: Daemon able to take recovery lock on \"%s\" during recovery\n",
+ state->ctdb->recovery_lock));
+ s = -1;
+ err = "Took recovery lock from daemon during recovery - probably a cluster filesystem lock coherence problem";
+ break;
+
+ case '1':
+ /* Contention: the expected outcome, so switch to normal mode,
+ * process deferred attaches and record the lock latency */
+ DEBUG(DEBUG_DEBUG, (__location__ " Recovery lock check OK\n"));
+ state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
+ ctdb_process_deferred_attach(state->ctdb);
+
+ s = 0;
+
+ CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock",
+ reclock.ctdbd, latency);
+ break;
+
+ case '2':
+ /* Timeout. Consider this a success, not a failure,
+ * as we failed to set the recovery lock which is what
+ * we wanted. This can be caused by the cluster
+ * filesystem being very slow to arbitrate locks
+ * immediately after a node failure.
+ * NOTE(review): unlike the other messages, the string below
+ * has no space after __location__. */
+ DEBUG(DEBUG_WARNING,
+ (__location__
+ "Time out getting recovery lock, allowing recmode set anyway\n"));
+ state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
+ ctdb_process_deferred_attach(state->ctdb);
+
+ s = 0;
+ break;
+
+ default:
+ DEBUG(DEBUG_ERR,
+ ("Unexpected error when testing recovery lock\n"));
+ s = -1;
+ err = "Unexpected error when testing recovery lock";
+ }
+
+ ctdb_request_control_reply(state->ctdb, state->c, NULL, s, err);
+ talloc_free(state);
+}
+
+/*
+ * Timer handler armed by ctdb_deferred_drop_all_ips(): the node has stayed
+ * in recovery for too long, so discard the timer context and release all
+ * public IP addresses.
+ */
+static void
+ctdb_drop_all_ips_event(struct tevent_context *ev, struct tevent_timer *te,
+			struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb;
+
+	ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+	DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
+
+	TALLOC_FREE(ctdb->release_ips_ctx);
+	ctdb_release_all_ips(ctdb);
+}
+
+/*
+ * Set up an event to drop all public ips if we remain in recovery for too
+ * long
+ *
+ * Any previously scheduled drop is cancelled first.  The timer fires
+ * after the recovery_drop_all_ips tunable (in seconds) and runs
+ * ctdb_drop_all_ips_event().
+ *
+ * Returns 0 on success, -1 if the timer context or the timer itself
+ * cannot be allocated; in the latter case no context is left behind.
+ */
+int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
+{
+	struct tevent_timer *te;
+
+	/* Freeing the context also removes a timer that is still pending */
+	TALLOC_FREE(ctdb->release_ips_ctx);
+
+	ctdb->release_ips_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
+
+	te = tevent_add_timer(ctdb->ev, ctdb->release_ips_ctx,
+			      timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0),
+			      ctdb_drop_all_ips_event, ctdb);
+	if (te == NULL) {
+		/* Do not report success when nothing has been scheduled */
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to add timer for deferred drop of all IPs\n"));
+		TALLOC_FREE(ctdb->release_ips_ctx);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ set the recovery mode
+
+ indata holds the requested mode as a uint32_t.  Requesting the mode that
+ is already set returns 0 immediately.  Switching to CTDB_RECOVERY_ACTIVE
+ arms the deferred drop of all public IPs and takes effect at once.
+ Switching back to normal cancels that deferred drop, requires every
+ database generation to match the vnn map generation (else -1), thaws the
+ databases if all of them are frozen and, when a recovery lock is
+ configured, starts a cluster mutex attempt whose result is handled by
+ set_recmode_handler(); in that case *async_reply is set and the reply to
+ "c" is sent from the handler.  errormsg is not used here.
+ */
+int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA indata, bool *async_reply,
+ const char **errormsg)
+{
+ uint32_t recmode = *(uint32_t *)indata.dptr;
+ struct ctdb_db_context *ctdb_db;
+ struct set_recmode_state *state;
+ struct ctdb_cluster_mutex_handle *h;
+
+ if (recmode == ctdb->recovery_mode) {
+ D_INFO("Recovery mode already set to %s\n",
+ recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE");
+ return 0;
+ }
+
+ D_NOTICE("Recovery mode set to %s\n",
+ recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE");
+
+ /* if we enter recovery but stay in recovery for too long
+ we will eventually drop all our ip addresses
+ */
+ if (recmode == CTDB_RECOVERY_ACTIVE) {
+ if (ctdb_deferred_drop_all_ips(ctdb) != 0) { /* failure is only logged; the mode is still switched */
+ D_ERR("Failed to set up deferred drop all ips\n");
+ }
+
+ ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+ return 0;
+ }
+
+ /* From this point: recmode == CTDB_RECOVERY_NORMAL
+ *
+ * Therefore, what follows is special handling when setting
+ * recovery mode back to normal */
+
+ TALLOC_FREE(ctdb->release_ips_ctx); /* cancels the pending deferred IP drop, even if a check below then fails */
+
+ for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) {
+ if (ctdb_db->generation != ctdb->vnn_map->generation) {
+ DEBUG(DEBUG_ERR,
+ ("Inconsistent DB generation %u for %s\n",
+ ctdb_db->generation, ctdb_db->db_name));
+ DEBUG(DEBUG_ERR, ("Recovery mode set to ACTIVE\n"));
+ return -1;
+ }
+ }
+
+ /* force the databases to thaw */
+ if (ctdb_db_all_frozen(ctdb)) {
+ ctdb_control_thaw(ctdb, false);
+ }
+
+ if (ctdb->recovery_lock == NULL) {
+ /* Not using recovery lock file */
+ ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
+ ctdb_process_deferred_attach(ctdb);
+ return 0;
+ }
+ /* With a recovery lock configured, the switch to normal mode is made
+  * by set_recmode_handler() depending on the mutex attempt result */
+ state = talloc_zero(ctdb, struct set_recmode_state);
+ if (state == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ return -1;
+ }
+ state->ctdb = ctdb;
+ state->c = NULL;
+
+ h = ctdb_cluster_mutex(state, ctdb, ctdb->recovery_lock, 5,
+ set_recmode_handler, state, NULL, NULL);
+ if (h == NULL) {
+ talloc_free(state);
+ return -1;
+ }
+ /* Take ownership of the request only once the attempt has started;
+  * from here on the reply is sent by set_recmode_handler() */
+ state->c = talloc_steal(state, c);
+ *async_reply = true;
+
+ return 0;
+}
+
+
+/*
+ delete a record as part of the vacuum process
+ only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
+ use non-blocking locks
+
+ "rec" carries the key followed by the caller's copy of the record data,
+ which must consist of exactly one ltdb header.
+
+ return 0 if the record was successfully deleted (i.e. it does not exist
+ when the function returns)
+ or !0 if the record still exists in the tdb after returning.
+
+ NOTE(review): a local record too short to hold an ltdb header returns 0
+ even when the freelist lock or the delete itself failed.
+ */
+static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data_old *rec)
+{
+ TDB_DATA key, data, data2;
+ struct ctdb_ltdb_header *hdr, *hdr2;
+
+ /* these are really internal tdb functions - but we need them here for
+ non-blocking lock of the freelist */
+ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
+ int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
+
+
+ key.dsize = rec->keylen;
+ key.dptr = &rec->data[0];
+ data.dsize = rec->datalen;
+ data.dptr = &rec->data[rec->keylen];
+
+ if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
+ DBG_INFO("Called delete on record where we are lmaster\n");
+ return -1;
+ }
+
+ if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+ DBG_ERR("Bad record size\n");
+ return -1;
+ }
+
+ hdr = (struct ctdb_ltdb_header *)data.dptr; /* header supplied by the caller */
+
+ /* use a non-blocking lock */
+ if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
+ DBG_INFO("Failed to get non-blocking chain lock\n");
+ return -1;
+ }
+
+ data2 = tdb_fetch(ctdb_db->ltdb->tdb, key); /* local copy; released with free() on every path below */
+ if (data2.dptr == NULL) { /* nothing stored locally, so nothing to delete */
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ return 0;
+ }
+
+ if (data2.dsize < sizeof(struct ctdb_ltdb_header)) {
+ if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
+ if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
+ DBG_ERR("Failed to delete corrupt record\n");
+ }
+ tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
+ DBG_ERR("Deleted corrupt record\n"); /* NOTE(review): logged even when tdb_delete() failed above */
+ }
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ free(data2.dptr);
+ return 0;
+ }
+
+ hdr2 = (struct ctdb_ltdb_header *)data2.dptr; /* header of the local copy */
+
+ if (hdr2->rsn > hdr->rsn) {
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ DBG_INFO("Skipping record with rsn=%llu - called with rsn=%llu\n",
+ (unsigned long long)hdr2->rsn,
+ (unsigned long long)hdr->rsn);
+ free(data2.dptr);
+ return -1;
+ }
+
+ /* do not allow deleting record that have readonly flags set. */
+ if (hdr->flags & CTDB_REC_RO_FLAGS) {
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ DBG_INFO("Skipping record with readonly flags set\n");
+ free(data2.dptr);
+ return -1;
+ }
+ if (hdr2->flags & CTDB_REC_RO_FLAGS) {
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ DBG_INFO("Skipping record with readonly flags set locally\n");
+ free(data2.dptr);
+ return -1;
+ }
+
+ if (hdr2->dmaster == ctdb->pnn) {
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ DBG_INFO("Attempted delete record where we are the dmaster\n");
+ free(data2.dptr);
+ return -1;
+ }
+ /* list -1 is the freelist mentioned at the declarations above */
+ if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ DBG_INFO("Failed to get non-blocking freelist lock\n");
+ free(data2.dptr);
+ return -1;
+ }
+
+ if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
+ tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ DBG_INFO("Failed to delete record\n");
+ free(data2.dptr);
+ return -1;
+ }
+
+ tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
+ tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+ free(data2.dptr);
+ return 0;
+}
+
+
+
+struct recovery_callback_state { /* passed to the start/end recovery callbacks in this file */
+ struct ctdb_req_control_old *c; /* control request the callback replies to */
+};
+
+
+/*
+  called when the 'recovered' event script has finished
+
+  Counts the recovery, bans this node if the script timed out, replies to
+  the pending control with the script status, records the finish time and
+  moves the run state from FIRST_RECOVERY to STARTUP.
+ */
+static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+	struct recovery_callback_state *cb = talloc_get_type(p, struct recovery_callback_state);
+	bool timed_out = (status == -ETIMEDOUT);
+
+	CTDB_INCREMENT_STAT(ctdb, num_recoveries);
+
+	if (status != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
+	}
+	if (timed_out) {
+		ctdb_ban_self(ctdb);
+	}
+
+	ctdb_request_control_reply(ctdb, cb->c, NULL, status, NULL);
+	talloc_free(cb);
+
+	gettimeofday(&ctdb->last_recovery_finished, NULL);
+
+	if (ctdb->runstate != CTDB_RUNSTATE_FIRST_RECOVERY) {
+		return;
+	}
+	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_STARTUP);
+}
+
+/*
+  recovery has finished
+
+  Finishes pending persistent commits and runs the "recovered" event
+  script.  The reply to "c" is sent from ctdb_end_recovery_callback(), so
+  on success *async_reply is set and 0 is returned.  Returns -1 if the
+  event script cannot be started.
+ */
+int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
+				struct ctdb_req_control_old *c,
+				bool *async_reply)
+{
+	struct recovery_callback_state *state;
+
+	DEBUG(DEBUG_ERR,("Recovery has finished\n"));
+
+	ctdb_persistent_finish_trans3_commits(ctdb);
+
+	state = talloc(ctdb, struct recovery_callback_state);
+	CTDB_NO_MEMORY(ctdb, state);
+	state->c = c;
+
+	if (ctdb_event_script_callback(ctdb, state,
+				       ctdb_end_recovery_callback, state,
+				       CTDB_EVENT_RECOVERED, "%s", "") != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	/* the reply will be sent asynchronously, so keep the request alive */
+	*async_reply = true;
+	state->c = talloc_steal(state, c);
+
+	return 0;
+}
+
+/*
+  called when the 'startrecovery' event script has finished
+
+  Replies to the pending control with the script status and frees the
+  callback state.
+ */
+static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+	struct recovery_callback_state *cb;
+
+	cb = talloc_get_type(p, struct recovery_callback_state);
+
+	if (status != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
+	}
+
+	ctdb_request_control_reply(ctdb, cb->c, NULL, status, NULL);
+	talloc_free(cb);
+}
+
+/*
+ * Start the "startrecovery" event script with
+ * ctdb_start_recovery_callback() as its completion callback.  If the
+ * script cannot be started, the pending control is answered with -1 and
+ * "state" is freed here.
+ */
+static void run_start_recovery_event(struct ctdb_context *ctdb,
+				     struct recovery_callback_state *state)
+{
+	int rc = ctdb_event_script_callback(ctdb, state,
+					    ctdb_start_recovery_callback,
+					    state,
+					    CTDB_EVENT_START_RECOVERY,
+					    "%s", "");
+
+	if (rc == 0) {
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,("Unable to run startrecovery event\n"));
+	ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+	talloc_free(state);
+}
+
+/*
+ * Compare two recovery lock settings.  Two NULL pointers are equal, a
+ * NULL and a non-NULL pointer are not, and two strings are equal when
+ * strcmp() says so.
+ */
+static bool reclock_strings_equal(const char *a, const char *b)
+{
+	if (a == NULL || b == NULL) {
+		/* equal only if both are unset */
+		return a == b;
+	}
+
+	return strcmp(a, b) == 0;
+}
+
+/*
+ * Reply handler for the GET_RECLOCK_FILE control sent by
+ * ctdb_control_start_recovery().  "data" holds the sender's recovery lock
+ * setting (empty when it has none), which is compared with the local one.
+ *
+ * - control failed: the pending start-recovery control is answered with
+ *   the same status and error message
+ * - settings differ: the pending control is answered with -1 and the
+ *   shutdown sequence is started
+ * - settings match: the "startrecovery" event is run
+ *
+ * In the first two cases the callback state is freed here.
+ */
+static void start_recovery_reclock_callback(struct ctdb_context *ctdb,
+					    int32_t status,
+					    TDB_DATA data,
+					    const char *errormsg,
+					    void *private_data)
+{
+	struct recovery_callback_state *state = talloc_get_type_abort(
+		private_data, struct recovery_callback_state);
+	const char *local = ctdb->recovery_lock;
+	const char *remote = NULL;
+
+	if (status != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
+		ctdb_request_control_reply(ctdb, state->c, NULL,
+					   status, errormsg);
+		talloc_free(state);
+		return;
+	}
+
+	/* Check reclock consistency */
+	if (data.dsize > 0) {
+		/* Ensure NUL-termination */
+		data.dptr[data.dsize-1] = '\0';
+		remote = (const char *)data.dptr;
+	}
+	if (! reclock_strings_equal(local, remote)) {
+		/* Inconsistent */
+		ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+		DEBUG(DEBUG_ERR,
+		      ("Recovery lock configuration inconsistent: "
+		       "recmaster has %s, this node has %s, shutting down\n",
+		       remote == NULL ? "NULL" : remote,
+		       local == NULL ? "NULL" : local));
+		talloc_free(state);
+		ctdb_shutdown_sequence(ctdb, 1);
+		/*
+		 * "state" has been freed and the control answered.
+		 * Nothing visible here guarantees that
+		 * ctdb_shutdown_sequence() never returns, so do not fall
+		 * through into the success path with a dangling pointer.
+		 */
+		return;
+	}
+	DEBUG(DEBUG_INFO,
+	      ("Recovery lock consistency check successful\n"));
+
+	run_start_recovery_event(ctdb, state);
+}
+
+/*
+ * Check recovery lock consistency and run eventscripts for the
+ * "startrecovery" event.
+ *
+ * The node that sent the control (c->hdr.srcnode) is asked for its
+ * recovery lock setting; start_recovery_reclock_callback() compares it
+ * and then runs the event.  If this node still considers the sender
+ * disconnected, the check is skipped and the event is run directly.
+ *
+ * The reply to "c" is always sent asynchronously once this function has
+ * returned 0 with *async_reply set.  Returns -1 if the GET_RECLOCK_FILE
+ * control cannot be sent.
+ */
+int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
+				    struct ctdb_req_control_old *c,
+				    bool *async_reply)
+{
+	int ret;
+	struct recovery_callback_state *state;
+	uint32_t recmaster = c->hdr.srcnode;
+
+	DEBUG(DEBUG_ERR, ("Recovery has started\n"));
+	gettimeofday(&ctdb->last_recovery_started, NULL);
+
+	state = talloc(ctdb, struct recovery_callback_state);
+	CTDB_NO_MEMORY(ctdb, state);
+
+	state->c = c;
+
+	/* Although the recovery master sent this node a start
+	 * recovery control, this node might still think the recovery
+	 * master is disconnected.  In this case defer the recovery
+	 * lock consistency check. */
+	if (ctdb->nodes[recmaster]->flags & NODE_FLAGS_DISCONNECTED) {
+		/*
+		 * run_start_recovery_event() replies and frees "state"
+		 * itself when the event cannot be started, so ownership
+		 * of the request must be settled before calling it and
+		 * "state" must not be touched afterwards.
+		 */
+		state->c = talloc_steal(state, c);
+		*async_reply = true;
+		run_start_recovery_event(ctdb, state);
+		return 0;
+	}
+
+	/* Ask the recovery master about its reclock setting */
+	ret = ctdb_daemon_send_control(ctdb,
+				       recmaster,
+				       0,
+				       CTDB_CONTROL_GET_RECLOCK_FILE,
+				       0, 0,
+				       tdb_null,
+				       start_recovery_reclock_callback,
+				       state);
+
+	if (ret != 0) {
+		/* "c" still belongs to the caller, which sends the reply */
+		DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	/* tell the control that we will reply asynchronously */
+	state->c = talloc_steal(state, c);
+	*async_reply = true;
+
+	return 0;
+}
+
+/*
+  try to delete all these records as part of the vacuuming process
+  and return the records we failed to delete
+
+  indata is a marshalled record buffer.  Every record in it is passed to
+  delete_tdb_record(); records that could not be deleted are copied
+  unchanged into the marshalled buffer returned in outdata.
+
+  Returns 0 on success, -1 if indata is shorter than a buffer header, the
+  database is unknown, a record is too short to hold an ltdb header, or
+  memory runs out.  The partial reply is freed on the last two.
+*/
+int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
+{
+	struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+	unsigned int i;
+	struct ctdb_rec_data_old *rec;
+	struct ctdb_marshall_buffer *records;
+
+	if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
+		DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
+		return -1;
+	}
+
+	ctdb_db = find_ctdb_db(ctdb, reply->db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
+		return -1;
+	}
+
+
+	DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
+			   reply->count, reply->db_id));
+
+
+	/* create a blob to send back the records we could not delete */
+	records = (struct ctdb_marshall_buffer *)
+			talloc_zero_size(outdata,
+					 offsetof(struct ctdb_marshall_buffer, data));
+	if (records == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		return -1;
+	}
+	records->db_id = ctdb_db->db_id;
+
+
+	rec = (struct ctdb_rec_data_old *)&reply->data[0];
+	for (i=0;i<reply->count;i++) {
+		if (rec->datalen < sizeof(struct ctdb_ltdb_header)) {
+			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
+			talloc_free(records);
+			return -1;
+		}
+
+		/* If we can not delete the record we must add it to the reply
+		   so the lmaster knows it may not purge this record
+		*/
+		if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
+			struct ctdb_marshall_buffer *grown;
+			size_t old_size;
+			TDB_DATA key;
+
+			/* the key is only needed for the log message */
+			key.dptr = &rec->data[0];
+			key.dsize = rec->keylen;
+
+			DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
+
+			/*
+			 * Grow through a temporary so the existing blob
+			 * can still be released if the realloc fails.
+			 */
+			old_size = talloc_get_size(records);
+			grown = talloc_realloc_size(outdata, records, old_size + rec->length);
+			if (grown == NULL) {
+				DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
+				talloc_free(records);
+				return -1;
+			}
+			records = grown;
+			records->count++;
+			memcpy(old_size+(uint8_t *)records, rec, rec->length);
+		}
+
+		/* rec->length is the distance to the next record */
+		rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
+	}
+
+
+	*outdata = ctdb_marshall_finish(records);
+
+	return 0;
+}
+
+/*
+  report capabilities
+
+  outdata receives a talloc'ed uint32_t copy of ctdb->capabilities.
+  Returns 0 on success; an allocation failure is handled by
+  CTDB_NO_MEMORY().
+ */
+int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+	uint32_t *capabilities;
+
+	capabilities = talloc(outdata, uint32_t);
+	CTDB_NO_MEMORY(ctdb, capabilities);
+
+	*capabilities = ctdb->capabilities;
+
+	outdata->dptr = (uint8_t *)capabilities;
+	outdata->dsize = sizeof(uint32_t);
+
+	return 0;
+}
+
+/* The recovery daemon will ping us at regular intervals.
+   If we have not been pinged for a while we assume the recovery
+   daemon is inoperable and we restart it.
+
+   Each expiry below the recd_ping_failcount tunable bumps the miss
+   counter and re-arms the timer; reaching the tunable stops and
+   restarts the recovery daemon.
+*/
+static void ctdb_recd_ping_timeout(struct tevent_context *ev,
+				   struct tevent_timer *te,
+				   struct timeval t, void *p)
+{
+	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+	uint32_t *misses = talloc_get_type(ctdb->recd_ping_count, uint32_t);
+
+	DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *misses));
+
+	if (*misses >= ctdb->tunable.recd_ping_failcount) {
+		DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n"));
+
+		ctdb_stop_recoverd(ctdb);
+		ctdb_start_recoverd(ctdb);
+		return;
+	}
+
+	/* Not given up yet: count the miss and wait one more interval */
+	*misses += 1;
+	tevent_add_timer(ctdb->ev, ctdb->recd_ping_count,
+			 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
+			 ctdb_recd_ping_timeout, ctdb);
+}
+
+/*
+ * Handle a ping from the recovery daemon: the miss counter is replaced by
+ * a fresh zeroed one (freeing the old one also drops any timer hanging
+ * off it) and, unless the recd_ping_timeout tunable is 0, the timeout
+ * timer is armed again.  Returns 0; an allocation failure is handled by
+ * CTDB_NO_MEMORY().
+ */
+int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
+{
+	talloc_free(ctdb->recd_ping_count);
+
+	ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
+	CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
+
+	/* A timeout of zero disables the watchdog */
+	if (ctdb->tunable.recd_ping_timeout == 0) {
+		return 0;
+	}
+
+	tevent_add_timer(ctdb->ev, ctdb->recd_ping_count,
+			 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
+			 ctdb_recd_ping_timeout, ctdb);
+
+	return 0;
+}
+
+void ctdb_node_become_inactive(struct ctdb_context *ctdb)
+{
+ struct ctdb_db_context *ctdb_db;
+ /*
+  * Take this node out of service: invalidate the generation of the
+  * vnn map and of every database, switch the recovery mode to
+  * active, send this node a FREEZE control and release all public
+  * IP addresses.
+  */
+ D_WARNING("Making node INACTIVE\n");
+
+ /*
+ * Do not service database calls - reset generation to invalid
+ * so this node ignores any REQ/REPLY CALL/DMASTER
+ */
+ ctdb->vnn_map->generation = INVALID_GENERATION;
+ for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) {
+ ctdb_db->generation = INVALID_GENERATION;
+ }
+
+ /*
+ * Although this bypasses the control, the only thing missing
+ * is the deferred drop of all public IPs, which isn't
+ * necessary because they are dropped below
+ */
+ if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+ D_NOTICE("Recovery mode set to ACTIVE\n");
+ ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+ }
+
+ /*
+ * Initiate database freeze - this will be scheduled for
+ * immediate execution and will be in progress long before the
+ * calling control returns
+ *
+ * The control is addressed to this node's own pnn, with the
+ * NOREPLY flag and without a callback, and its return value is
+ * not checked.
+ */
+ ctdb_daemon_send_control(ctdb,
+ ctdb->pnn,
+ 0,
+ CTDB_CONTROL_FREEZE,
+ 0,
+ CTDB_CTRL_FLAG_NOREPLY,
+ tdb_null,
+ NULL,
+ NULL);
+
+ D_NOTICE("Dropping all public IP addresses\n");
+ ctdb_release_all_ips(ctdb);
+}
+
+/*
+ * Flag this node as stopped and make it inactive.  Always returns 0.
+ */
+int32_t ctdb_control_stop_node(struct ctdb_context *ctdb)
+{
+	struct ctdb_node *self = ctdb->nodes[ctdb->pnn];
+
+	DEBUG(DEBUG_ERR, ("Stopping node\n"));
+
+	self->flags |= NODE_FLAGS_STOPPED;
+	ctdb_node_become_inactive(ctdb);
+
+	return 0;
+}
+
+/*
+ * Clear the stopped flag of this node.  Nothing else is changed here.
+ * Always returns 0.
+ */
+int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
+{
+	struct ctdb_node *self = ctdb->nodes[ctdb->pnn];
+
+	DEBUG(DEBUG_ERR, ("Continue node\n"));
+
+	self->flags &= ~NODE_FLAGS_STOPPED;
+
+	return 0;
+}
+
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
new file mode 100644
index 0000000..bf3a66b
--- /dev/null
+++ b/ctdb/server/ctdb_recoverd.c
@@ -0,0 +1,3286 @@
+/*
+ ctdb recovery daemon
+
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/wait.h"
+
+#include <popt.h>
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "protocol/protocol_basic.h"
+
+#include "common/system_socket.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "server/ctdb_config.h"
+
+#include "ctdb_cluster_mutex.h"
+
+/*
+ * One queued SRVID request that still needs to be processed.  Entries
+ * are chained into a doubly linked list (see DLIST_ADD in
+ * srvid_request_add()).
+ */
+struct srvid_list {
+ /* Links used by the DLIST_* macros */
+ struct srvid_list *next, *prev;
+ /* The queued message; talloc-stolen onto this list entry */
+ struct ctdb_srvid_message *request;
+};
+
+/* Container for the list of queued SRVID requests */
+struct srvid_requests {
+ /* Head of the list; entries are talloc children of this struct */
+ struct srvid_list *requests;
+};
+
+/*
+ * Send "result" back to the originator of "request" and free the
+ * request.  A request carrying srvid 0 asked for no reply, so it is
+ * only freed.
+ */
+static void srvid_request_reply(struct ctdb_context *ctdb,
+				struct ctdb_srvid_message *request,
+				TDB_DATA result)
+{
+	int ret;
+
+	if (request->srvid != 0) {
+		ret = ctdb_client_send_message(ctdb,
+					       request->pnn,
+					       request->srvid,
+					       result);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
+					 (unsigned)request->pnn,
+					 (unsigned long long)request->srvid));
+		} else {
+			DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
+					  (unsigned)request->pnn,
+					  (unsigned long long)request->srvid));
+		}
+	}
+
+	/* The request is consumed whether or not a reply was sent */
+	talloc_free(request);
+}
+
+/*
+ * Answer every queued request with "result", then free the queue and
+ * reset *requests to NULL.  Does nothing when there is no queue.
+ */
+static void srvid_requests_reply(struct ctdb_context *ctdb,
+				 struct srvid_requests **requests,
+				 TDB_DATA result)
+{
+	struct srvid_list *entry;
+
+	if (*requests == NULL) {
+		return;
+	}
+
+	entry = (*requests)->requests;
+	while (entry != NULL) {
+		/* Frees the request but not the list entry itself */
+		srvid_request_reply(ctdb, entry->request, result);
+		entry = entry->next;
+	}
+
+	/* Releases the container and, with it, all list entries */
+	TALLOC_FREE(*requests);
+}
+
+/*
+ * Queue "request" on *requests, allocating the queue on first use.
+ * On success the queue takes ownership of the request.  If memory
+ * cannot be allocated the requester is sent -ENOMEM and the request is
+ * freed.
+ */
+static void srvid_request_add(struct ctdb_context *ctdb,
+			      struct srvid_requests **requests,
+			      struct ctdb_srvid_message *request)
+{
+	struct srvid_list *entry;
+	int32_t err;
+	TDB_DATA reply;
+
+	if (*requests == NULL) {
+		*requests = talloc_zero(ctdb, struct srvid_requests);
+	}
+	if (*requests == NULL) {
+		goto nomem;
+	}
+
+	entry = talloc_zero(*requests, struct srvid_list);
+	if (entry != NULL) {
+		entry->request = (struct ctdb_srvid_message *)talloc_steal(
+			entry, request);
+		DLIST_ADD((*requests)->requests, entry);
+		return;
+	}
+
+	/* Do not keep an empty queue around (e.g. one allocated above) */
+	if ((*requests)->requests == NULL) {
+		TALLOC_FREE(*requests);
+	}
+
+nomem:
+	/* The request could not be queued, so fail it right away */
+	DEBUG(DEBUG_ERR, (__location__
+			  " Out of memory, failed to queue SRVID request\n"));
+	err = -ENOMEM;
+	reply.dsize = sizeof(err);
+	reply.dptr = (uint8_t *)&err;
+	srvid_request_reply(ctdb, request, reply);
+}
+
+/* An abstraction to allow an operation (takeover runs, recoveries,
+ * ...) to be disabled for a given timeout */
+struct ctdb_op_state {
+ /* Non-NULL while the operation is disabled; fires to re-enable it */
+ struct tevent_timer *timer;
+ /* Set by ctdb_op_begin(), cleared by ctdb_op_end() */
+ bool in_progress;
+ /* Name used in log messages; the pointer is stored, not copied */
+ const char *name;
+};
+
+/*
+ * Allocate an operation state on mem_ctx.  The operation starts out
+ * enabled and not in progress.  Returns NULL if allocation fails.
+ * "name" is referenced, not copied.
+ */
+static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
+{
+	struct ctdb_op_state *state;
+
+	state = talloc_zero(mem_ctx, struct ctdb_op_state);
+	if (state == NULL) {
+		return NULL;
+	}
+
+	state->in_progress = false;
+	state->name = name;
+
+	return state;
+}
+
+/* An operation is disabled for as long as its re-enable timer exists */
+static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
+{
+ return state->timer != NULL;
+}
+
+/*
+ * Mark the operation as in progress.  Returns false, leaving the state
+ * untouched, if the operation is currently disabled.
+ */
+static bool ctdb_op_begin(struct ctdb_op_state *state)
+{
+	if (!ctdb_op_is_disabled(state)) {
+		state->in_progress = true;
+		return true;
+	}
+
+	DEBUG(DEBUG_NOTICE,
+	      ("Unable to begin - %s are disabled\n", state->name));
+	return false;
+}
+
+/*
+ * Mark the operation as no longer in progress.  The return value is
+ * the new value of the flag, i.e. always false.
+ */
+static bool ctdb_op_end(struct ctdb_op_state *state)
+{
+	state->in_progress = false;
+	return false;
+}
+
+/* True between ctdb_op_begin() and ctdb_op_end() */
+static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
+{
+ return state->in_progress;
+}
+
+/*
+ * Re-enable the operation by freeing the disable timer; a NULL timer
+ * is what ctdb_op_is_disabled() treats as "enabled"
+ */
+static void ctdb_op_enable(struct ctdb_op_state *state)
+{
+ TALLOC_FREE(state->timer);
+}
+
+/*
+ * Timer callback installed by ctdb_op_disable(): the disable period is
+ * over, so re-enable the operation.  "p" is the struct ctdb_op_state.
+ */
+static void ctdb_op_timeout_handler(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval yt, void *p)
+{
+	struct ctdb_op_state *state;
+
+	state = talloc_get_type(p, struct ctdb_op_state);
+
+	DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
+	ctdb_op_enable(state);
+}
+
+/*
+ * Disable the operation for "timeout" seconds; a timeout of 0
+ * re-enables it immediately.
+ *
+ * Returns 0 on success, -EAGAIN if the operation is in progress and
+ * therefore can not be disabled, -ENOMEM if the timer can not be set
+ * up.
+ */
+static int ctdb_op_disable(struct ctdb_op_state *state,
+			   struct tevent_context *ev,
+			   uint32_t timeout)
+{
+	struct tevent_timer *te;
+
+	if (timeout == 0) {
+		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
+		ctdb_op_enable(state);
+		return 0;
+	}
+
+	if (state->in_progress) {
+		DEBUG(DEBUG_ERR,
+		      ("Unable to disable %s - in progress\n", state->name));
+		return -EAGAIN;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
+			    state->name, timeout));
+
+	/* Drop a previous timer, if any; it is replaced just below */
+	talloc_free(state->timer);
+
+	/* The handler re-enables the operation when the timer fires */
+	te = tevent_add_timer(ev, state,
+			      timeval_current_ofs(timeout, 0),
+			      ctdb_op_timeout_handler, state);
+	state->timer = te;
+	if (te == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Ban credits accumulated against one node (see ctdb_set_culprit_count()) */
+struct ctdb_banning_state {
+ /* Node the credits belong to */
+ uint32_t pnn;
+ /* Number of credits accumulated so far */
+ uint32_t count;
+ /* When credits were last added for this node */
+ struct timeval last_reported_time;
+};
+
+struct ctdb_cluster_lock_handle;
+
+/*
+ private state of recovery daemon
+ */
+struct ctdb_recoverd {
+ struct ctdb_context *ctdb;
+ /* PNN of the current leader; compared with pnn in this_node_is_leader() */
+ uint32_t leader;
+ /* Timer that re-arms leader_broadcast_loop_handler() */
+ struct tevent_timer *leader_broadcast_te;
+ struct tevent_timer *leader_broadcast_timeout_te;
+ /* PNN of this node */
+ uint32_t pnn;
+ /* Node most recently passed to ctdb_set_culprit_count() */
+ uint32_t last_culprit_node;
+ /* talloc array of per-node ban credits; NULL when there are none */
+ struct ctdb_banning_state *banning_state;
+ struct ctdb_node_map_old *nodemap;
+ struct timeval priority_time;
+ /* Set by do_takeover_run() to true when a run did not succeed */
+ bool need_takeover_run;
+ /* Set at the start of do_recovery(), cleared when it succeeds */
+ bool need_recovery;
+ /* Flags of this node; tested against NODE_FLAGS_INACTIVE */
+ uint32_t node_flags;
+ struct tevent_timer *send_election_te;
+ /* Cleared by ctdb_election_timeout() when the election period ends */
+ bool election_in_progress;
+ struct tevent_timer *election_timeout;
+ struct srvid_requests *reallocate_requests;
+ /* Disable/in-progress tracking for takeover runs and recoveries */
+ struct ctdb_op_state *takeover_run;
+ struct ctdb_op_state *recovery;
+ struct ctdb_iface_list_old *ifaces;
+ /* talloc array of PNNs handed to the takeover helper */
+ uint32_t *force_rebalance_nodes;
+ /* Last result of update_capabilities() */
+ struct ctdb_node_capabilities *caps;
+ bool frozen_on_inactive;
+ /* Non-NULL is what cluster_lock_held() reports as "held" */
+ struct ctdb_cluster_lock_handle *cluster_lock_handle;
+ /* PID of the helper that helper_run() is waiting for, otherwise -1 */
+ pid_t helper_pid;
+};
+
+/*
+ * Absolute timeouts derived from tunables.  Both macros expand to an
+ * expression that uses a variable named "ctdb", so they can only be
+ * used where such a variable is in scope.
+ */
+#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
+#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
+
+static void ctdb_restart_recd(struct tevent_context *ev,
+ struct tevent_timer *te, struct timeval t,
+ void *private_data);
+
+/* True if the recorded leader is this node */
+static bool this_node_is_leader(struct ctdb_recoverd *rec)
+{
+ return rec->leader == rec->pnn;
+}
+
+/*
+ * This node is eligible to be leader only if it is not INACTIVE and
+ * has the RECMASTER capability
+ */
+static bool this_node_can_be_leader(struct ctdb_recoverd *rec)
+{
+	if ((rec->node_flags & NODE_FLAGS_INACTIVE) != 0) {
+		return false;
+	}
+
+	return (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) != 0;
+}
+
+/*
+ * Look up node "pnn" in rec->nodemap.  Returns true if it is present,
+ * in which case its flags are stored in *flags when "flags" is not
+ * NULL.  Returns false if the node is not in the nodemap.
+ */
+static bool node_flags(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t *flags)
+{
+	size_t idx;
+
+	for (idx = 0; idx < rec->nodemap->num; idx++) {
+		struct ctdb_node_and_flags *entry = &rec->nodemap->nodes[idx];
+
+		if (entry->pnn != pnn) {
+			continue;
+		}
+
+		if (flags != NULL) {
+			*flags = entry->flags;
+		}
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Ban node "pnn" for the period given by the recovery_ban_period
+ * tunable.  Failures are logged only; nothing is reported to the
+ * caller.
+ */
+static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_ban_state bantime;
+	uint32_t ban_time;
+	int ret;
+
+	if (!ctdb_validate_pnn(ctdb, pnn)) {
+		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
+		return;
+	}
+
+	ban_time = ctdb->tunable.recovery_ban_period;
+	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
+
+	bantime.pnn = pnn;
+	bantime.time = ban_time;
+
+	/* The ban control is sent to the node being banned */
+	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
+	}
+}
+
+/* Result codes for monitoring checks (NOTE(review): the users of this enum are outside this part of the file) */
+enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
+
+
+/*
+ * Add "count" ban credits to node "culprit" and remember it as the
+ * last culprit.
+ *
+ * Nothing is recorded if the culprit is not in the nodemap, if this
+ * node is itself INACTIVE (banned or stopped), or if memory for a new
+ * entry can not be allocated.  Credits older than the
+ * recovery_grace_period tunable are forgiven before the new ones are
+ * added.
+ */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec,
+				   uint32_t culprit,
+				   uint32_t count)
+{
+	struct ctdb_context *ctdb = talloc_get_type_abort(
+		rec->ctdb, struct ctdb_context);
+	struct ctdb_banning_state *ban_state = NULL;
+	size_t len;
+	bool ok;
+
+	ok = node_flags(rec, culprit, NULL);
+	if (!ok) {
+		DBG_WARNING("Unknown culprit node %"PRIu32"\n", culprit);
+		return;
+	}
+
+	/* If we are banned or stopped, do not set other nodes as culprits */
+	if (rec->node_flags & NODE_FLAGS_INACTIVE) {
+		D_WARNING("This node is INACTIVE, "
+			  "cannot set culprit node %"PRIu32"\n",
+			  culprit);
+		return;
+	}
+
+	if (rec->banning_state == NULL) {
+		len = 0;
+	} else {
+		size_t i;
+
+		len = talloc_array_length(rec->banning_state);
+
+		for (i = 0 ; i < len; i++) {
+			if (rec->banning_state[i].pnn == culprit) {
+				ban_state = &rec->banning_state[i];
+				break;
+			}
+		}
+	}
+
+	if (ban_state == NULL) {
+		/* Not found, so extend (or allocate new) array */
+		struct ctdb_banning_state *t;
+
+		len += 1;
+		/*
+		 * talloc_realloc() handles the corner case where
+		 * rec->banning_state is NULL
+		 */
+		t = talloc_realloc(rec,
+				   rec->banning_state,
+				   struct ctdb_banning_state,
+				   len);
+		if (t == NULL) {
+			DBG_WARNING("Memory allocation error\n");
+			return;
+		}
+		rec->banning_state = t;
+
+		/* New element is always at the end - initialise it... */
+		ban_state = &rec->banning_state[len - 1];
+		*ban_state = (struct ctdb_banning_state) {
+			.pnn = culprit,
+			.count = 0,
+		};
+	} else if (ban_state->count > 0 &&
+		   timeval_elapsed(&ban_state->last_reported_time) >
+		   ctdb->tunable.recovery_grace_period) {
+		/*
+		 * Forgive old transgressions beyond the tunable time-limit
+		 */
+		ban_state->count = 0;
+	}
+
+	ban_state->count += count;
+	ban_state->last_reported_time = timeval_current();
+	rec->last_culprit_node = culprit;
+}
+
+/*
+ * Forget all accumulated ban credits by freeing the whole
+ * banning_state array
+ */
+static void ban_counts_reset(struct ctdb_recoverd *rec)
+{
+ D_NOTICE("Resetting ban count to 0 for all nodes\n");
+ TALLOC_FREE(rec->banning_state);
+}
+
+/*
+ * Remember the trouble maker: add a single ban credit to node
+ * "culprit"
+ */
+static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
+{
+ ctdb_set_culprit_count(rec, culprit, 1);
+}
+
+/*
+ * Fetch the capabilities of the nodes in "nodemap".  On success the
+ * result replaces rec->caps and this node's own capabilities are
+ * copied to ctdb->capabilities.
+ *
+ * Returns 0 on success, -1 on failure (existing state is untouched).
+ */
+static int update_capabilities(struct ctdb_recoverd *rec,
+			       struct ctdb_node_map_old *nodemap)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_node_capabilities *caps;
+	uint32_t *own_caps;
+	TALLOC_CTX *tmp_ctx;
+	int status = -1;
+
+	tmp_ctx = talloc_new(rec);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
+				     CONTROL_TIMEOUT(), nodemap);
+	if (caps == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " Failed to get node capabilities\n"));
+		goto out;
+	}
+
+	own_caps = ctdb_get_node_capabilities(caps, rec->pnn);
+	if (own_caps == NULL) {
+		DEBUG(DEBUG_ERR,
+		      (__location__
+		       " Capabilities don't include current node.\n"));
+		goto out;
+	}
+	ctdb->capabilities = *own_caps;
+
+	/* Move the new list off tmp_ctx so it survives the free below */
+	TALLOC_FREE(rec->caps);
+	rec->caps = talloc_steal(rec, caps);
+	status = 0;
+
+out:
+	talloc_free(tmp_ctx);
+	return status;
+}
+
+/*
+ * Send CTDB_CONTROL_SET_RECMODE with "rec_mode" to the active nodes in
+ * "nodemap".  Returns 0 on success, -1 if the control fails.
+ */
+static int set_recovery_mode(struct ctdb_context *ctdb,
+			     struct ctdb_recoverd *rec,
+			     struct ctdb_node_map_old *nodemap,
+			     uint32_t rec_mode)
+{
+	TALLOC_CTX *tmp_ctx;
+	uint32_t *targets;
+	TDB_DATA payload;
+	int status = 0;
+	int ret;
+
+	tmp_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+	targets = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+
+	payload.dsize = sizeof(uint32_t);
+	payload.dptr = (unsigned char *)&rec_mode;
+
+	ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
+					targets, 0,
+					CONTROL_TIMEOUT(),
+					false, payload,
+					NULL, NULL,
+					NULL);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
+		status = -1;
+	}
+
+	talloc_free(tmp_ctx);
+	return status;
+}
+
+/*
+ * Tell all connected nodes that node "pnn" now has "flags", by sending
+ * CTDB_CONTROL_MODIFY_FLAGS.  The old flags reported in the message
+ * are the ones recorded for the node in rec->nodemap.
+ *
+ * Returns 0 on success, -1 if the node is not in the nodemap or the
+ * control fails.
+ */
+static int update_flags_on_all_nodes(struct ctdb_recoverd *rec,
+				     uint32_t pnn,
+				     uint32_t flags)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct timeval timeout = CONTROL_TIMEOUT();
+	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+	struct ctdb_node_map_old *nodemap = rec->nodemap;
+	struct ctdb_node_flag_change change;
+	TDB_DATA payload;
+	uint32_t *targets;
+	uint32_t idx = 0;
+	int status = -1;
+	int ret;
+
+	/* Locate the node so its current flags can be reported */
+	while (idx < nodemap->num && nodemap->nodes[idx].pnn != pnn) {
+		idx++;
+	}
+	if (idx >= nodemap->num) {
+		DBG_ERR("Nodemap does not contain node %d\n", pnn);
+		goto out;
+	}
+
+	change.pnn = pnn;
+	change.old_flags = nodemap->nodes[idx].flags;
+	change.new_flags = flags;
+
+	payload.dsize = sizeof(change);
+	payload.dptr = (unsigned char *)&change;
+
+	/* send the flags update to all connected nodes */
+	targets = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+	ret = ctdb_client_async_control(ctdb,
+					CTDB_CONTROL_MODIFY_FLAGS,
+					targets,
+					0,
+					timeout,
+					false,
+					payload,
+					NULL,
+					NULL,
+					NULL);
+	if (ret != 0) {
+		DBG_ERR("Unable to update flags on remote nodes\n");
+		goto out;
+	}
+
+	status = 0;
+
+out:
+	talloc_free(tmp_ctx);
+	return status;
+}
+
+static bool _cluster_lock_lock(struct ctdb_recoverd *rec);
+static bool cluster_lock_held(struct ctdb_recoverd *rec);
+
+/* The cluster lock is in use only if a recovery lock is configured */
+static bool cluster_lock_enabled(struct ctdb_recoverd *rec)
+{
+ return rec->ctdb->recovery_lock != NULL;
+}
+
+/*
+ * Make sure this node holds the cluster lock.  Returns true if the
+ * lock is not enabled, is already held or was taken now; false if it
+ * could not be taken.
+ */
+static bool cluster_lock_take(struct ctdb_recoverd *rec)
+{
+	if (!cluster_lock_enabled(rec)) {
+		return true;
+	}
+
+	if (cluster_lock_held(rec)) {
+		D_NOTICE("Already holding cluster lock\n");
+		return true;
+	}
+
+	D_NOTICE("Attempting to take cluster lock (%s)\n",
+		 rec->ctdb->recovery_lock);
+	if (!_cluster_lock_lock(rec)) {
+		return false;
+	}
+
+	D_NOTICE("Cluster lock taken successfully\n");
+	return true;
+}
+
+/*
+ * Timer callback for ctdb_wait_timeout(): "p" points at the caller's
+ * uint32_t flag, which is set to 1 to end the wait
+ */
+static void ctdb_wait_handler(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval yt, void *p)
+{
+	*(uint32_t *)p = 1;
+}
+
+/*
+ * Wait for "secs" seconds (fractions allowed) while continuing to run
+ * the event loop, so other events are still processed during the wait
+ */
+static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
+{
+	uint32_t timed_out = 0;
+	/* Fractional part of secs, expressed in microseconds */
+	time_t usecs = (secs - (time_t)secs) * 1000000;
+	struct timeval when = timeval_current_ofs(secs, usecs);
+
+	tevent_add_timer(ctdb->ev, ctdb, when, ctdb_wait_handler, &timed_out);
+
+	/* ctdb_wait_handler() sets the flag when the timer fires */
+	while (timed_out == 0) {
+		tevent_loop_once(ctdb->ev);
+	}
+}
+
+/*
+ * Broadcast "pnn" as the cluster leader to all connected nodes, using
+ * CTDB_SRVID_LEADER.  Returns the result of sending the message.
+ */
+
+static int leader_broadcast_send(struct ctdb_recoverd *rec, uint32_t pnn)
+{
+	TDB_DATA payload = {
+		.dptr = (uint8_t *)&pnn,
+		.dsize = sizeof(pnn),
+	};
+
+	return ctdb_client_send_message(rec->ctdb,
+					CTDB_BROADCAST_CONNECTED,
+					CTDB_SRVID_LEADER,
+					payload);
+}
+
+static int leader_broadcast_loop(struct ctdb_recoverd *rec);
+static void cluster_lock_release(struct ctdb_recoverd *rec);
+
+/*
+ * Timer callback that runs continuously but only sends the broadcast
+ * when this node is leader and no election is in progress.
+ *
+ * If this node can no longer be leader it gives up leadership and
+ * releases the cluster lock.  The timer is re-armed in every case.
+ */
+static void leader_broadcast_loop_handler(struct tevent_context *ev,
+					  struct tevent_timer *te,
+					  struct timeval current_time,
+					  void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type_abort(
+		private_data, struct ctdb_recoverd);
+	int ret;
+
+	if (!this_node_can_be_leader(rec)) {
+		if (this_node_is_leader(rec)) {
+			rec->leader = CTDB_UNKNOWN_PNN;
+		}
+		if (cluster_lock_enabled(rec) && cluster_lock_held(rec)) {
+			cluster_lock_release(rec);
+		}
+	} else if (this_node_is_leader(rec) && !rec->election_in_progress) {
+		ret = leader_broadcast_send(rec, rec->leader);
+		if (ret != 0) {
+			DBG_WARNING("Failed to send leader broadcast\n");
+		}
+	}
+
+	/* Schedule the next iteration */
+	ret = leader_broadcast_loop(rec);
+	if (ret != 0) {
+		D_WARNING("Failed to set up leader broadcast\n");
+	}
+}
+
+/*
+ * (Re)arm the leader broadcast timer to fire in one second, replacing
+ * any timer already pending.  Returns 0 on success, ENOMEM if the
+ * timer can not be created.
+ */
+static int leader_broadcast_loop(struct ctdb_recoverd *rec)
+{
+	struct tevent_timer *te;
+
+	TALLOC_FREE(rec->leader_broadcast_te);
+
+	te = tevent_add_timer(rec->ctdb->ev,
+			      rec,
+			      timeval_current_ofs(1, 0),
+			      leader_broadcast_loop_handler,
+			      rec);
+	rec->leader_broadcast_te = te;
+
+	return (te == NULL) ? ENOMEM : 0;
+}
+
+/* True if the leader broadcast timer is currently armed */
+static bool leader_broadcast_loop_active(struct ctdb_recoverd *rec)
+{
+ return rec->leader_broadcast_te != NULL;
+}
+
+/*
+ * Timer callback run when an election times out (ends).
+ *
+ * Clears the election state.  If this node came out as leader it must
+ * take the cluster lock; if it can not, it bans itself.
+ */
+static void ctdb_election_timeout(struct tevent_context *ev,
+				  struct tevent_timer *te,
+				  struct timeval t, void *p)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
+
+	rec->election_in_progress = false;
+	rec->election_timeout = NULL;
+	fast_start = false;
+
+	D_WARNING("Election period ended, leader=%u\n", rec->leader);
+
+	if (this_node_is_leader(rec) && !cluster_lock_take(rec)) {
+		D_ERR("Unable to get cluster lock, banning node\n");
+		ctdb_ban_node(rec, rec->pnn);
+	}
+}
+
+
+/*
+ * Run the event loop until the current election has finished, i.e.
+ * until election_in_progress has been cleared (ctdb_election_timeout()
+ * does this when the election period ends)
+ */
+static void ctdb_wait_election(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+
+	for (;;) {
+		if (!rec->election_in_progress) {
+			break;
+		}
+		tevent_loop_once(ctdb->ev);
+	}
+}
+
+/*
+ * Update local flags from all remote connected nodes and push out
+ * flags changes to all nodes. This is only run by the leader.
+ *
+ * remote_nodemaps is indexed the same way as nodemap->nodes: entry i
+ * is the nodemap as seen by the node at index i.  The entry for this
+ * node itself is never read.
+ *
+ * Returns 0 on success, -1 if pushing flags to the other nodes fails.
+ */
+static int update_flags(struct ctdb_recoverd *rec,
+ struct ctdb_node_map_old *nodemap,
+ struct ctdb_node_map_old **remote_nodemaps)
+{
+ unsigned int j;
+ struct ctdb_context *ctdb = rec->ctdb;
+ /* NOTE(review): nothing in this function allocates on mem_ctx */
+ TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+
+ /* Check flags from remote nodes */
+ for (j=0; j<nodemap->num; j++) {
+ struct ctdb_node_map_old *remote_nodemap=NULL;
+ uint32_t local_flags = nodemap->nodes[j].flags;
+ uint32_t remote_pnn = nodemap->nodes[j].pnn;
+ uint32_t remote_flags;
+ unsigned int i;
+ int ret;
+
+ if (local_flags & NODE_FLAGS_DISCONNECTED) {
+ continue;
+ }
+ if (remote_pnn == rec->pnn) {
+ /*
+ * No remote nodemap for this node since this
+ * is the local nodemap. However, still need
+ * to check this against the remote nodes and
+ * push it if they are out-of-date.
+ */
+ goto compare_remotes;
+ }
+
+ /* Node j's own view of its flags is taken as authoritative */
+ remote_nodemap = remote_nodemaps[j];
+ remote_flags = remote_nodemap->nodes[j].flags;
+
+ if (local_flags != remote_flags) {
+ /*
+ * Update the local copy of the flags in the
+ * recovery daemon.
+ */
+ D_NOTICE("Remote node %u had flags 0x%x, "
+ "local had 0x%x - updating local\n",
+ remote_pnn,
+ remote_flags,
+ local_flags);
+ nodemap->nodes[j].flags = remote_flags;
+ local_flags = remote_flags;
+ goto push;
+ }
+
+compare_remotes:
+ /*
+ * Check what every other connected node (except this
+ * one) believes node j's flags to be; push on the first
+ * disagreement
+ */
+ for (i = 0; i < nodemap->num; i++) {
+ if (i == j) {
+ continue;
+ }
+ if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+ continue;
+ }
+ if (nodemap->nodes[i].pnn == rec->pnn) {
+ continue;
+ }
+
+ remote_nodemap = remote_nodemaps[i];
+ remote_flags = remote_nodemap->nodes[j].flags;
+
+ if (local_flags != remote_flags) {
+ goto push;
+ }
+ }
+
+ /* Everybody agrees about node j - nothing to push */
+ continue;
+
+push:
+ D_NOTICE("Pushing updated flags for node %u (0x%x)\n",
+ remote_pnn,
+ local_flags);
+ ret = update_flags_on_all_nodes(rec, remote_pnn, local_flags);
+ if (ret != 0) {
+ DBG_ERR("Unable to update flags on remote nodes\n");
+ talloc_free(mem_ctx);
+ return -1;
+ }
+ }
+ talloc_free(mem_ctx);
+ return 0;
+}
+
+
+/*
+ * Create a new random generation id.  The value is drawn again until
+ * it differs from INVALID_GENERATION, which is never returned.
+ */
+static uint32_t new_generation(void)
+{
+	uint32_t generation;
+
+	do {
+		generation = random();
+	} while (generation == INVALID_GENERATION);
+
+	return generation;
+}
+
+/*
+ * The lock counts as held whenever a lock handle exists.  Note that
+ * _cluster_lock_lock() installs the handle before the attempt to take
+ * the lock has completed.
+ */
+static bool cluster_lock_held(struct ctdb_recoverd *rec)
+{
+ return (rec->cluster_lock_handle != NULL);
+}
+
+/* State of one attempt to take (and then hold) the cluster lock */
+struct ctdb_cluster_lock_handle {
+ /* Set once the attempt has finished, successfully or not */
+ bool done;
+ /* True if the lock was taken */
+ bool locked;
+ /* Latency reported by the lock handler on success */
+ double latency;
+ /* Underlying cluster mutex; freed when the attempt fails or is cancelled */
+ struct ctdb_cluster_mutex_handle *h;
+ struct ctdb_recoverd *rec;
+};
+
+/*
+ * Callback reporting the outcome of an attempt to take the cluster
+ * lock.  Status '0' means the lock was taken; '1' is contention, '2'
+ * is timeout and anything else an unknown error.  "private_data" is
+ * the struct ctdb_cluster_lock_handle of the attempt.
+ */
+static void take_cluster_lock_handler(char status,
+				      double latency,
+				      void *private_data)
+{
+	struct ctdb_cluster_lock_handle *handle =
+		(struct ctdb_cluster_lock_handle *) private_data;
+	bool got_lock = (status == '0');
+
+	handle->locked = got_lock;
+
+	if (got_lock) {
+		handle->latency = latency;
+	} else {
+		/*
+		 * If unsuccessful then ensure the process has exited
+		 * and that the file descriptor event handler has been
+		 * cancelled
+		 */
+		TALLOC_FREE(handle->h);
+
+		if (status == '1') {
+			D_ERR("Unable to take cluster lock - contention\n");
+		} else if (status == '2') {
+			D_ERR("Unable to take cluster lock - timeout\n");
+		} else {
+			D_ERR("Unable to take cluster lock - unknown error\n");
+		}
+	}
+
+	handle->done = true;
+}
+
+static void force_election(struct ctdb_recoverd *rec);
+
+/*
+ * Callback run when the cluster lock helper terminates: the lock is
+ * gone, so drop the handle and, if this node is still eligible to
+ * lead, force a new election.  "private_data" is the struct
+ * ctdb_recoverd.
+ */
+static void lost_cluster_lock_handler(void *private_data)
+{
+	struct ctdb_recoverd *rec;
+
+	rec = talloc_get_type_abort(private_data, struct ctdb_recoverd);
+
+	D_ERR("Cluster lock helper terminated\n");
+	TALLOC_FREE(rec->cluster_lock_handle);
+
+	if (!this_node_can_be_leader(rec)) {
+		return;
+	}
+
+	force_election(rec);
+}
+
+/*
+ * Try to take the cluster lock and wait, running the event loop, until
+ * the attempt has completed.  On success the handle stays in
+ * rec->cluster_lock_handle and the lock latency is reported to the
+ * daemon.  Returns true if the lock was taken, false otherwise.
+ */
+static bool _cluster_lock_lock(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_cluster_lock_handle *handle;
+	struct ctdb_cluster_mutex_handle *mutex;
+
+	handle = talloc_zero(rec, struct ctdb_cluster_lock_handle);
+	if (handle == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return false;
+	}
+
+	handle->rec = rec;
+
+	mutex = ctdb_cluster_mutex(handle,
+				   ctdb,
+				   ctdb->recovery_lock,
+				   120,
+				   take_cluster_lock_handler,
+				   handle,
+				   lost_cluster_lock_handler,
+				   rec);
+	if (mutex == NULL) {
+		talloc_free(handle);
+		return false;
+	}
+
+	rec->cluster_lock_handle = handle;
+	handle->h = mutex;
+
+	/* "done" is set by the take handler or by cluster_lock_release() */
+	while (! handle->done) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	if (! handle->locked) {
+		TALLOC_FREE(rec->cluster_lock_handle);
+		return false;
+	}
+
+	ctdb_ctrl_report_recd_lock_latency(ctdb,
+					   CONTROL_TIMEOUT(),
+					   handle->latency);
+
+	return true;
+}
+
+/*
+ * Release the cluster lock, or cancel an attempt to take it that is
+ * still in progress.  Does nothing if there is no lock handle.
+ */
+static void cluster_lock_release(struct ctdb_recoverd *rec)
+{
+	struct ctdb_cluster_lock_handle *handle = rec->cluster_lock_handle;
+
+	if (handle == NULL) {
+		return;
+	}
+
+	if (handle->done) {
+		D_NOTICE("Releasing cluster lock\n");
+		TALLOC_FREE(rec->cluster_lock_handle);
+		return;
+	}
+
+	/*
+	 * Taking of cluster lock still in progress.  Free the cluster
+	 * mutex handle to release it but leave the cluster lock handle
+	 * in place to allow taking of the lock to fail.
+	 */
+	D_NOTICE("Cancelling cluster lock\n");
+	TALLOC_FREE(handle->h);
+	handle->done = true;
+	handle->locked = false;
+}
+
+/*
+ * Ban every node whose ban credits have reached twice the number of
+ * nodes in the nodemap, resetting its credits afterwards.  *self_ban
+ * is set to true if this node was among those banned, false otherwise.
+ */
+static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
+{
+	size_t num_states = talloc_array_length(rec->banning_state);
+	size_t idx;
+
+	*self_ban = false;
+
+	for (idx = 0; idx < num_states; idx++) {
+		struct ctdb_banning_state *entry = &rec->banning_state[idx];
+
+		if (entry->count >= 2 * rec->nodemap->num) {
+			D_NOTICE("Node %u reached %u banning credits\n",
+				 entry->pnn,
+				 entry->count);
+			ctdb_ban_node(rec, entry->pnn);
+			entry->count = 0;
+
+			/* Banning ourself? */
+			if (entry->pnn == rec->pnn) {
+				*self_ban = true;
+			}
+		}
+	}
+}
+
+/* State of one helper process started by helper_run() */
+struct helper_state {
+ /* Pipe: fd[0] is read by this process, fd[1] is handed to the helper */
+ int fd[2];
+ /* PID of the helper, -1 if it has not been started */
+ pid_t pid;
+ /* Result read from the pipe; EPIPE if a full result could not be read */
+ int result;
+ /* Set once a result (or read failure) has been recorded */
+ bool done;
+};
+
+/*
+ * fd event callback: the helper's pipe became readable.  Read the
+ * helper's int result; anything other than a complete int is recorded
+ * as EPIPE.  Either way the helper is then marked as done.
+ */
+static void helper_handler(struct tevent_context *ev,
+			   struct tevent_fd *fde,
+			   uint16_t flags, void *private_data)
+{
+	struct helper_state *state;
+	int nread;
+
+	state = talloc_get_type_abort(private_data, struct helper_state);
+
+	nread = sys_read(state->fd[0], &state->result, sizeof(state->result));
+	if (nread != sizeof(state->result)) {
+		state->result = EPIPE;
+	}
+
+	state->done = true;
+}
+
+/*
+ * Run helper program "prog" and wait for its result.
+ *
+ * The helper is started with the write end of a pipe, the daemon
+ * socket name and, if not NULL, "arg" as arguments.  It reports an int
+ * result through the pipe.  While waiting the event loop keeps
+ * running; the wait is abandoned if this node stops being leader.
+ * The helper is always sent SIGKILL before returning.
+ *
+ * "type" is only used in log messages.
+ *
+ * Returns 0 if the helper reported 0, -1 on any failure.
+ */
+static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
+		      const char *prog, const char *arg, const char *type)
+{
+	struct helper_state *state;
+	struct tevent_fd *fde;
+	const char **args;
+	int nargs, ret;
+
+	state = talloc_zero(mem_ctx, struct helper_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		return -1;
+	}
+
+	state->pid = -1;
+	/*
+	 * talloc_zero() leaves both descriptors as 0, which is a valid
+	 * descriptor.  Mark them as not open so that the failure path
+	 * never closes descriptor 0 by accident.
+	 */
+	state->fd[0] = -1;
+	state->fd[1] = -1;
+
+	ret = pipe(state->fd);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to create pipe for %s helper\n", type));
+		/* Do not rely on pipe() leaving the array untouched */
+		state->fd[0] = -1;
+		state->fd[1] = -1;
+		goto fail;
+	}
+
+	/* Only the write end may be inherited by the helper */
+	set_close_on_exec(state->fd[0]);
+
+	nargs = 4;
+	args = talloc_array(state, const char *, nargs);
+	if (args == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		goto fail;
+	}
+
+	args[0] = talloc_asprintf(args, "%d", state->fd[1]);
+	if (args[0] == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		goto fail;
+	}
+	args[1] = rec->ctdb->daemon.name;
+	args[2] = arg;
+	args[3] = NULL;
+
+	/* Without an extra argument only the first 3 entries are passed */
+	if (args[2] == NULL) {
+		nargs = 3;
+	}
+
+	state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
+	if (state->pid == -1) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to create child for %s helper\n", type));
+		goto fail;
+	}
+
+	close(state->fd[1]);
+	state->fd[1] = -1;
+
+	rec->helper_pid = state->pid;
+	state->done = false;
+
+	fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
+			    TEVENT_FD_READ, helper_handler, state);
+	if (fde == NULL) {
+		goto fail;
+	}
+	tevent_fd_set_auto_close(fde);
+
+	while (!state->done) {
+		tevent_loop_once(rec->ctdb->ev);
+
+		if (!this_node_is_leader(rec)) {
+			D_ERR("Leader changed to %u, aborting %s\n",
+			      rec->leader,
+			      type);
+			state->result = 1;
+			break;
+		}
+	}
+
+	/*
+	 * The fd event owns the read end (auto-close), so freeing the
+	 * event closes the descriptor exactly once.  Closing it by hand
+	 * as well would close it a second time when state is freed.
+	 */
+	TALLOC_FREE(fde);
+	state->fd[0] = -1;
+
+	if (state->result != 0) {
+		goto fail;
+	}
+
+	rec->helper_pid = -1;
+	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
+	talloc_free(state);
+	return 0;
+
+fail:
+	if (state->fd[0] != -1) {
+		close(state->fd[0]);
+	}
+	if (state->fd[1] != -1) {
+		close(state->fd[1]);
+	}
+	rec->helper_pid = -1;
+	if (state->pid != -1) {
+		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
+	}
+	talloc_free(state);
+	return -1;
+}
+
+
+/*
+ * Run the takeover helper.
+ *
+ * force_rebalance_nodes is a talloc array of PNNs (may be NULL or
+ * empty); it is passed to the helper as a comma separated list.  If IP
+ * failover is disabled in the configuration this is passed on to the
+ * helper via the CTDB_DISABLE_IP_FAILOVER environment variable.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int ctdb_takeover(struct ctdb_recoverd *rec,
+			 uint32_t *force_rebalance_nodes)
+{
+	static char prog[PATH_MAX+1] = "";
+	char *arg;
+	unsigned int i;
+	int ret;
+
+	if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
+			     "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
+			     "ctdb_takeover_helper")) {
+		ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
+	}
+
+	arg = NULL;
+	for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
+		uint32_t pnn = force_rebalance_nodes[i];
+		if (arg == NULL) {
+			arg = talloc_asprintf(rec, "%u", pnn);
+		} else {
+			arg = talloc_asprintf_append(arg, ",%u", pnn);
+		}
+		if (arg == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+			return -1;
+		}
+	}
+
+	if (ctdb_config.failover_disabled) {
+		ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
+		if (ret != 0) {
+			D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
+			TALLOC_FREE(arg);
+			return -1;
+		}
+	}
+
+	ret = helper_run(rec, rec, prog, arg, "takeover");
+
+	/*
+	 * The argument string lives on the long-lived rec context, so
+	 * it has to be released here or it accumulates with every
+	 * takeover run
+	 */
+	TALLOC_FREE(arg);
+
+	return ret;
+}
+
+/*
+ * Perform a takeover run: disable takeover runs on the other connected
+ * nodes, run the takeover helper, then re-enable takeover runs on
+ * those nodes.
+ *
+ * Returns true if the run succeeded.  rec->need_takeover_run is set to
+ * the opposite of the result.  The force rebalance targets are cleared
+ * after a successful run unless they were replaced while it was
+ * running.
+ */
+static bool do_takeover_run(struct ctdb_recoverd *rec,
+ struct ctdb_node_map_old *nodemap)
+{
+ uint32_t *nodes = NULL;
+ struct ctdb_disable_message dtr;
+ TDB_DATA data;
+ size_t i;
+ /* Remembered to detect a change of targets during the run */
+ uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
+ int ret;
+ bool ok;
+
+ DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
+
+ /*
+ * NOTE(review): this path, like every other, ends in
+ * ctdb_op_end() at "done", which clears the in-progress flag of
+ * the run that is already under way - confirm this is intended
+ */
+ if (ctdb_op_is_in_progress(rec->takeover_run)) {
+ DEBUG(DEBUG_ERR, (__location__
+ " takeover run already in progress \n"));
+ ok = false;
+ goto done;
+ }
+
+ if (!ctdb_op_begin(rec->takeover_run)) {
+ ok = false;
+ goto done;
+ }
+
+ /* Disable IP checks (takeover runs, really) on other nodes
+ * while doing this takeover run. This will stop those other
+ * nodes from triggering takeover runs when think they should
+ * be hosting an IP but it isn't yet on an interface. Don't
+ * wait for replies since a failure here might cause some
+ * noise in the logs but will not actually cause a problem.
+ */
+ ZERO_STRUCT(dtr);
+ dtr.srvid = 0; /* No reply */
+ dtr.pnn = -1;
+
+ /* data points at dtr, so later changes to dtr.timeout are sent too */
+ data.dptr = (uint8_t*)&dtr;
+ data.dsize = sizeof(dtr);
+
+ nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
+
+ /* Disable for 60 seconds. This can be a tunable later if
+ * necessary.
+ */
+ dtr.timeout = 60;
+ for (i = 0; i < talloc_array_length(nodes); i++) {
+ if (ctdb_client_send_message(rec->ctdb, nodes[i],
+ CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+ data) != 0) {
+ DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
+ }
+ }
+
+ ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
+
+ /* Reenable takeover runs and IP checks on other nodes */
+ dtr.timeout = 0;
+ for (i = 0; i < talloc_array_length(nodes); i++) {
+ if (ctdb_client_send_message(rec->ctdb, nodes[i],
+ CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+ data) != 0) {
+ DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
+ }
+ }
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
+ ok = false;
+ goto done;
+ }
+
+ ok = true;
+ /* Takeover run was successful so clear force rebalance targets */
+ if (rebalance_nodes == rec->force_rebalance_nodes) {
+ TALLOC_FREE(rec->force_rebalance_nodes);
+ } else {
+ DEBUG(DEBUG_WARNING,
+ ("Rebalance target nodes changed during takeover run - not clearing\n"));
+ }
+done:
+ rec->need_takeover_run = !ok;
+ talloc_free(nodes);
+ ctdb_op_end(rec->takeover_run);
+
+ DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
+ return ok;
+}
+
+/*
+ * Run the recovery helper with a freshly generated generation id as
+ * its argument.  The state database directory is passed to the helper
+ * in the CTDB_DBDIR_STATE environment variable.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
+{
+	static char prog[PATH_MAX+1] = "";
+	const char *generation_arg;
+	bool have_helper;
+
+	have_helper = ctdb_set_helper("recovery_helper", prog, sizeof(prog),
+				      "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
+				      "ctdb_recovery_helper");
+	if (!have_helper) {
+		ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
+	}
+
+	generation_arg = talloc_asprintf(mem_ctx, "%u", new_generation());
+	if (generation_arg == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
+		return -1;
+	}
+
+	setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
+
+	return helper_run(rec, mem_ctx, prog, generation_arg, "recovery");
+}
+
+/*
+ * Main recovery function, only run by leader
+ *
+ * Bans misbehaving nodes, checks that the cluster lock is held,
+ * refreshes capabilities, pushes this node's view of all flags to the
+ * other nodes, runs the database recovery helper, performs a takeover
+ * run and finally tells all clients that the cluster has been
+ * reconfigured.
+ *
+ * rec->need_recovery stays set if the recovery fails so that it is
+ * attempted again.  After a successful recovery the ban counts are
+ * reset and further recoveries are disabled for the
+ * rerecovery_timeout tunable.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int do_recovery(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_node_map_old *nodemap = rec->nodemap;
+	unsigned int i;
+	int ret;
+	bool self_ban;
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
+
+	/* Check if the current node is still the leader. It's possible that
+	 * re-election has changed the leader.
+	 */
+	if (!this_node_is_leader(rec)) {
+		D_NOTICE("Leader changed to %u, aborting recovery\n",
+			 rec->leader);
+		return -1;
+	}
+
+	/* if recovery fails, force it again */
+	rec->need_recovery = true;
+
+	if (!ctdb_op_begin(rec->recovery)) {
+		return -1;
+	}
+
+	/*
+	 * From here on the recovery is marked as in progress, so every
+	 * exit has to go through ctdb_op_end() - use "goto fail"
+	 * instead of returning directly
+	 */
+
+	if (rec->election_in_progress) {
+		/* an election is in progress */
+		DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
+		goto fail;
+	}
+
+	ban_misbehaving_nodes(rec, &self_ban);
+	if (self_ban) {
+		DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
+		goto fail;
+	}
+
+	if (cluster_lock_enabled(rec) && !cluster_lock_held(rec)) {
+		/* Leader can change in ban_misbehaving_nodes() */
+		if (!this_node_is_leader(rec)) {
+			D_NOTICE("Leader changed to %u, aborting recovery\n",
+				 rec->leader);
+			rec->need_recovery = false;
+			goto fail;
+		}
+
+		D_ERR("Cluster lock not held - abort recovery, ban node\n");
+		ctdb_ban_node(rec, rec->pnn);
+		goto fail;
+	}
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
+
+	/* Retrieve capabilities from all connected nodes */
+	ret = update_capabilities(rec, nodemap);
+	if (ret!=0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
+		goto fail;
+	}
+
+	/*
+	  update all nodes to have the same flags that we have
+	 */
+	for (i=0;i<nodemap->num;i++) {
+		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+			continue;
+		}
+
+		ret = update_flags_on_all_nodes(rec,
+						nodemap->nodes[i].pnn,
+						nodemap->nodes[i].flags);
+		if (ret != 0) {
+			/* A failure for an inactive node is tolerated */
+			if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+				DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
+			} else {
+				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+				goto fail;
+			}
+		}
+	}
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
+	ret = db_recovery_parallel(rec, mem_ctx);
+	if (ret != 0) {
+		goto fail;
+	}
+
+	/* A failed takeover run does not fail the recovery */
+	do_takeover_run(rec, nodemap);
+
+	/* send a message to all clients telling them that the cluster
+	   has been reconfigured */
+	ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
+				       CTDB_SRVID_RECONFIGURE, tdb_null);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
+		goto fail;
+	}
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
+
+	rec->need_recovery = false;
+	ctdb_op_end(rec->recovery);
+
+	/*
+	 * Completed a full recovery so forgive any past transgressions
+	 */
+	ban_counts_reset(rec);
+
+	/* We just finished a recovery successfully.
+	   We now wait for rerecovery_timeout before we allow
+	   another recovery to take place.
+	*/
+	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
+	ctdb_op_disable(rec->recovery, ctdb->ev,
+			ctdb->tunable.rerecovery_timeout);
+	return 0;
+
+fail:
+	ctdb_op_end(rec->recovery);
+	return -1;
+}
+
+
+/*
+ elections are won by first checking the number of connected nodes, then
+ the priority time, then the pnn
+
+ NOTE(review): ctdb_election_win() as visible in this file compares only
+ node_flags, priority_time and pnn - num_connected is filled in by
+ ctdb_election_data() but not compared there. Confirm whether the
+ description above is still accurate.
+ */
+struct election_message {
+ uint32_t num_connected; /* nodes not flagged NODE_FLAGS_DISCONNECTED; forced to 0 when the sender cannot be leader */
+ struct timeval priority_time; /* copied from rec->priority_time; set to the current time when the sender cannot be leader */
+ uint32_t pnn; /* PNN of the sending node */
+ uint32_t node_flags; /* sending node's own node flags */
+};
+
+/*
+ * Form this node's election data
+ *
+ * Zeroes *em, then fills in this node's PNN, priority time, node flags
+ * and the number of nodes in the local nodemap that are not flagged
+ * NODE_FLAGS_DISCONNECTED.  If this node cannot be leader the data is
+ * adjusted so that it should lose (no connected nodes, priority time
+ * of "now").
+ *
+ * On failure to fetch the nodemap or the node flags the function
+ * returns early, leaving *em only partially filled in.  The nodemap
+ * is a temporary allocated on rec and is released on every path after
+ * it has been fetched.
+ */
+static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_node_map_old *nodemap = NULL;
+	unsigned int i;
+	int ret;
+	bool ok;
+
+	ZERO_STRUCTP(em);
+
+	em->pnn = rec->pnn;
+	em->priority_time = rec->priority_time;
+
+	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
+		return;
+	}
+
+	ok = node_flags(rec, rec->pnn, &rec->node_flags);
+	if (!ok) {
+		DBG_ERR("Unable to get node flags for this node\n");
+		/* Do not leak the nodemap, which hangs off rec */
+		talloc_free(nodemap);
+		return;
+	}
+	em->node_flags = rec->node_flags;
+
+	for (i=0;i<nodemap->num;i++) {
+		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
+			em->num_connected++;
+		}
+	}
+
+	if (!this_node_can_be_leader(rec)) {
+		/* Try to lose... */
+		em->num_connected = 0;
+		em->priority_time = timeval_current();
+	}
+
+	talloc_free(nodemap);
+}
+
+/*
+ * Decide whether this node beats the election data in *em
+ *
+ * Returns false if this node cannot be leader, true if the other node
+ * has any NODE_FLAGS_INACTIVE bit set.  Otherwise the priority times
+ * are compared (the longest running node wins) and, if they are
+ * equal, the PNNs decide.
+ */
+static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
+{
+	struct election_message mine;
+	int order;
+
+	ctdb_election_data(rec, &mine);
+
+	if (!this_node_can_be_leader(rec)) {
+		return false;
+	}
+
+	/* Automatically win if other node is banned or stopped */
+	if ((em->node_flags & NODE_FLAGS_INACTIVE) != 0) {
+		return true;
+	}
+
+	/* then the longest running node */
+	order = timeval_compare(&em->priority_time, &mine.priority_time);
+	if (order != 0) {
+		return order > 0;
+	}
+
+	/* priority times are equal, fall back to comparing PNNs */
+	order = (int)mine.pnn - (int)em->pnn;
+
+	return order > 0;
+}
+
+/*
+ * Send out an election request
+ *
+ * Builds this node's election data, records this node as the presumed
+ * leader and sends the data to CTDB_BROADCAST_ALL on
+ * CTDB_SRVID_ELECTION.  Returns the result of
+ * ctdb_client_send_message().
+ */
+static int send_election_request(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct election_message mine;
+	TDB_DATA payload;
+
+	ctdb_election_data(rec, &mine);
+
+	payload.dptr = (unsigned char *)&mine;
+	payload.dsize = sizeof(struct election_message);
+
+	/* Assume this node will win the election, set leader accordingly */
+	rec->leader = rec->pnn;
+
+	/* send an election message to all active nodes */
+	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
+
+	return ctdb_client_send_message(ctdb,
+					CTDB_BROADCAST_ALL,
+					CTDB_SRVID_ELECTION,
+					payload);
+}
+
+/*
+ * Timer callback: we think we are winning the election - send a
+ * broadcast election request
+ *
+ * The timer reference in rec->send_election_te is released afterwards,
+ * whether or not the request could be sent.
+ */
+static void election_send_request(struct tevent_context *ev,
+				  struct tevent_timer *te,
+				  struct timeval t, void *p)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
+
+	if (send_election_request(rec) != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
+	}
+
+	TALLOC_FREE(rec->send_election_te);
+}
+
+/*
+ * Handler for memory dumps
+ *
+ * The message data must be exactly one struct ctdb_srvid_message
+ * naming the PNN and SRVID to reply to.  The dump produced by
+ * ctdb_dump_memory() is sent back to that address.  All temporary
+ * memory hangs off a scratch context that is freed on every path.
+ */
+static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+	struct ctdb_context *ctdb = rec->ctdb;
+	TALLOC_CTX *scratch = talloc_new(ctdb);
+	struct ctdb_srvid_message *reply_to;
+	TDB_DATA *blob;
+	int rc;
+
+	if (data.dsize != sizeof(struct ctdb_srvid_message)) {
+		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+		goto out;
+	}
+	reply_to = (struct ctdb_srvid_message *)data.dptr;
+
+	blob = talloc_zero(scratch, TDB_DATA);
+	if (blob == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
+		goto out;
+	}
+
+	rc = ctdb_dump_memory(ctdb, blob);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
+		goto out;
+	}
+
+	DBG_ERR("recovery daemon memory dump\n");
+
+	rc = ctdb_client_send_message(ctdb,
+				      reply_to->pnn,
+				      reply_to->srvid,
+				      *blob);
+	if (rc != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
+	}
+
+out:
+	talloc_free(scratch);
+}
+
+/*
+ * Handler for reload_nodes: reload the nodes file into this recovery
+ * daemon's ctdb context.  The message data is not examined.
+ */
+static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
+				 void *private_data)
+{
+	struct ctdb_recoverd *recoverd = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+	struct ctdb_context *ctdb = recoverd->ctdb;
+
+	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
+
+	ctdb_load_nodes_file(ctdb);
+}
+
+
+/*
+ * Handler for node rebalance requests
+ *
+ * Only acted on by the leader.  The message data must be exactly one
+ * uint32_t holding the PNN to rebalance IPs to; that PNN is appended
+ * to rec->force_rebalance_nodes.
+ */
+static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
+					void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+	struct ctdb_context *ctdb = rec->ctdb;
+	uint32_t pnn;
+	uint32_t *t;
+	size_t len;
+
+	if (!this_node_is_leader(rec)) {
+		return;
+	}
+
+	if (data.dsize != sizeof(uint32_t)) {
+		/* Both values are size_t, so %zu is the matching conversion */
+		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(uint32_t)));
+		return;
+	}
+
+	/* Copy out of the message buffer rather than dereferencing a
+	 * cast pointer, so no alignment is assumed for data.dptr.
+	 */
+	memcpy(&pnn, data.dptr, sizeof(pnn));
+
+	DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
+
+	/* Copy any existing list of nodes.  There's probably some
+	 * sort of realloc variant that will do this but we need to
+	 * make sure that freeing the old array also cancels the timer
+	 * event for the timeout... not sure if realloc will do that.
+	 */
+	len = (rec->force_rebalance_nodes != NULL) ?
+		talloc_array_length(rec->force_rebalance_nodes) :
+		0;
+
+	/* This allows duplicates to be added but they don't cause
+	 * harm.  A call to add a duplicate PNN arguably means that
+	 * the timeout should be reset, so this is the simplest
+	 * solution.
+	 */
+	t = talloc_zero_array(rec, uint32_t, len+1);
+	CTDB_NO_MEMORY_VOID(ctdb, t);
+	if (len > 0) {
+		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
+	}
+	t[len] = pnn;
+
+	talloc_free(rec->force_rebalance_nodes);
+
+	rec->force_rebalance_nodes = t;
+}
+
+
+
+/*
+ * Disable an operation for the requested time and reply to the sender
+ *
+ * The message data must be exactly one struct ctdb_disable_message.
+ * Malformed messages are logged and dropped without a reply.
+ * Otherwise ctdb_op_disable() is called on op_state with the
+ * requested timeout and a single int32_t is sent back: this node's
+ * PNN on success, or the non-zero value returned by ctdb_op_disable()
+ * on failure.
+ */
+static void srvid_disable_and_reply(struct ctdb_recoverd *rec,
+				    TDB_DATA data,
+				    struct ctdb_op_state *op_state)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_disable_message *r;
+	uint32_t timeout;
+	TDB_DATA result;
+	int32_t ret = 0;
+
+	/* Validate input data */
+	if (data.dsize != sizeof(struct ctdb_disable_message)) {
+		/* Report the size that is actually being checked for */
+		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
+				 "expecting %lu\n", (long unsigned)data.dsize,
+				 (long unsigned)sizeof(struct ctdb_disable_message)));
+		return;
+	}
+	if (data.dptr == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
+		return;
+	}
+
+	r = (struct ctdb_disable_message *)data.dptr;
+	timeout = r->timeout;
+
+	ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
+	if (ret != 0) {
+		goto done;
+	}
+
+	/* Returning our PNN tells the caller that we succeeded */
+	ret = rec->pnn;
+done:
+	result.dsize = sizeof(int32_t);
+	result.dptr = (uint8_t *)&ret;
+	/* NOTE(review): the disable message is passed on as a srvid
+	 * message here - presumably both start with the same reply
+	 * address fields; confirm against the struct definitions.
+	 */
+	srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
+}
+
+/*
+ * Handler for requests to disable takeover runs: applies the request
+ * to rec->takeover_run and replies to the sender
+ */
+static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
+					  void *private_data)
+{
+	struct ctdb_recoverd *recoverd;
+
+	recoverd = talloc_get_type(private_data, struct ctdb_recoverd);
+	srvid_disable_and_reply(recoverd, data, recoverd->takeover_run);
+}
+
+/*
+ * Backward compatibility for this SRVID
+ *
+ * The message data must be exactly one uint32_t timeout, which is
+ * used to disable takeover runs.  No reply is sent.
+ */
+static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
+				     void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+	const uint32_t *requested;
+
+	if (data.dsize != sizeof(uint32_t)) {
+		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
+				 "expecting %lu\n", (long unsigned)data.dsize,
+				 (long unsigned)sizeof(uint32_t)));
+		return;
+	}
+	if (data.dptr == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
+		return;
+	}
+
+	requested = (const uint32_t *)data.dptr;
+
+	ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, *requested);
+}
+
+/*
+ * Handler for requests to disable recoveries: applies the request to
+ * rec->recovery and replies to the sender
+ */
+static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
+				       void *private_data)
+{
+	struct ctdb_recoverd *recoverd;
+
+	recoverd = talloc_get_type(private_data, struct ctdb_recoverd);
+	srvid_disable_and_reply(recoverd, data, recoverd->recovery);
+}
+
+/*
+ * Handler for ip reallocate
+ *
+ * Just add the request to the list of pending requests; it is handled
+ * later in the monitor_cluster loop so we do not recurse with other
+ * requests to takeover_run().  Messages that are not exactly one
+ * struct ctdb_srvid_message are logged and dropped.
+ */
+static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
+				  void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+
+	if (data.dsize == sizeof(struct ctdb_srvid_message)) {
+		srvid_request_add(rec->ctdb,
+				  &rec->reallocate_requests,
+				  (struct ctdb_srvid_message *)data.dptr);
+		return;
+	}
+
+	DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+}
+
+/*
+ * Run a takeover run on behalf of the queued IP reallocate requests
+ * and reply to all of them with the outcome: this node's PNN if the
+ * takeover run succeeded, -1 otherwise.
+ */
+static void process_ipreallocate_requests(struct ctdb_context *ctdb,
+					  struct ctdb_recoverd *rec)
+{
+	struct srvid_requests *pending;
+	int32_t outcome = -1;
+	TDB_DATA reply;
+
+	/* Only process requests that are currently pending.  More
+	 * might come in while the takeover run is in progress and
+	 * they will need to be processed later since they might
+	 * be in response to flag changes.
+	 */
+	pending = rec->reallocate_requests;
+	rec->reallocate_requests = NULL;
+
+	if (do_takeover_run(rec, rec->nodemap)) {
+		outcome = rec->pnn;
+	}
+
+	reply.dptr = (uint8_t *)&outcome;
+	reply.dsize = sizeof(int32_t);
+
+	srvid_requests_reply(ctdb, &pending, reply);
+}
+
+/*
+ * Handler for assigning banning credits
+ *
+ * Only acted on by the leader.  The message data must be exactly one
+ * uint32_t PNN; ctdb_set_culprit_count() is called for that PNN with
+ * a count equal to the number of nodes in the current nodemap.
+ */
+static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+	const uint32_t *culprit;
+
+	/* Ignore if we are not leader */
+	if (!this_node_is_leader(rec)) {
+		return;
+	}
+
+	if (data.dsize != sizeof(uint32_t)) {
+		DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
+				  data.dsize));
+		return;
+	}
+
+	culprit = (const uint32_t *)data.dptr;
+
+	ctdb_set_culprit_count(rec, *culprit, rec->nodemap->num);
+}
+
+/*
+ * Handler for leader elections
+ *
+ * The message data is another node's struct election_message.
+ * Messages too short to hold one are logged and dropped, and packets
+ * carrying this node's own PNN are ignored.  Any other packet restarts
+ * the election timeout and marks an election as in progress.
+ *
+ * If this node would win against the sender, a timer is scheduled (if
+ * not already pending) to broadcast this node's own election request.
+ * Otherwise any pending request is cancelled, the cluster lock is
+ * released if held, and the sender is recorded as leader.
+ */
+static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct election_message *em = NULL;
+
+	/* Never read election data beyond the end of the message */
+	if (data.dptr == NULL ||
+	    data.dsize < sizeof(struct election_message)) {
+		DEBUG(DEBUG_ERR, (__location__ " Election message too short: %zu bytes, expecting at least %zu\n",
+				  data.dsize,
+				  sizeof(struct election_message)));
+		return;
+	}
+	em = (struct election_message *)data.dptr;
+
+	/* Ignore election packets from ourself */
+	if (rec->pnn == em->pnn) {
+		return;
+	}
+
+	/* we got an election packet - update the timeout for the election */
+	talloc_free(rec->election_timeout);
+	rec->election_in_progress = true;
+	rec->election_timeout = tevent_add_timer(
+			ctdb->ev, ctdb,
+			fast_start ?
+				timeval_current_ofs(0, 500000) :
+				timeval_current_ofs(ctdb->tunable.election_timeout, 0),
+			ctdb_election_timeout, rec);
+
+	/* someone called an election. check their election data
+	   and if we disagree and we would rather be the elected node,
+	   send a new election message to all other nodes
+	 */
+	if (ctdb_election_win(rec, em)) {
+		if (!rec->send_election_te) {
+			rec->send_election_te = tevent_add_timer(
+					ctdb->ev, rec,
+					timeval_current_ofs(0, 500000),
+					election_send_request, rec);
+		}
+		return;
+	}
+
+	/* we didn't win */
+	TALLOC_FREE(rec->send_election_te);
+
+	/* Release the cluster lock file */
+	if (cluster_lock_held(rec)) {
+		cluster_lock_release(rec);
+	}
+
+	/* Set leader to the winner of this round */
+	rec->leader = em->pnn;
+
+	return;
+}
+
+/*
+ * Run an election that is decided by the cluster lock
+ *
+ * A node that cannot be leader gives up the lock if it holds it.  A
+ * node that can be leader keeps the lock if it already holds it;
+ * otherwise the leader is reset to unknown and the node tries to take
+ * the lock, becoming leader if that succeeds.  In every case the
+ * election is marked as no longer in progress on return.
+ */
+static void cluster_lock_election(struct ctdb_recoverd *rec)
+{
+	if (!this_node_can_be_leader(rec)) {
+		if (cluster_lock_held(rec)) {
+			cluster_lock_release(rec);
+		}
+	} else if (!cluster_lock_held(rec)) {
+		/*
+		 * Don't need to unconditionally release the lock and
+		 * then attempt to retake it.  This provides stability.
+		 * The lock is only contended for when not already held.
+		 */
+		rec->leader = CTDB_UNKNOWN_PNN;
+
+		if (cluster_lock_take(rec)) {
+			rec->leader = rec->pnn;
+			D_WARNING("Took cluster lock, leader=%"PRIu32"\n", rec->leader);
+		}
+	}
+
+	rec->election_in_progress = false;
+}
+
+/*
+ * Force the start of the election process
+ *
+ * Puts all nodes into recovery mode, marks an election as in progress
+ * and broadcasts an "unknown leader" message.  When the cluster lock
+ * is enabled the election is decided by cluster_lock_election();
+ * otherwise the election timeout is (re)started, an election request
+ * is broadcast and the function waits for the election to finish.
+ *
+ * If recovery mode cannot be set nothing else is done.
+ */
+static void force_election(struct ctdb_recoverd *rec)
+{
+	int ret;
+	struct ctdb_context *ctdb = rec->ctdb;
+
+	D_ERR("Start election\n");
+
+	/* set all nodes to recovery mode to stop all internode traffic */
+	ret = set_recovery_mode(ctdb, rec, rec->nodemap, CTDB_RECOVERY_ACTIVE);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
+		return;
+	}
+
+	rec->election_in_progress = true;
+	/* Let other nodes know that an election is underway */
+	leader_broadcast_send(rec, CTDB_UNKNOWN_PNN);
+
+	if (cluster_lock_enabled(rec)) {
+		cluster_lock_election(rec);
+		return;
+	}
+
+	talloc_free(rec->election_timeout);
+	rec->election_timeout = tevent_add_timer(
+			ctdb->ev, ctdb,
+			fast_start ?
+				timeval_current_ofs(0, 500000) :
+				timeval_current_ofs(ctdb->tunable.election_timeout, 0),
+			ctdb_election_timeout, rec);
+
+	ret = send_election_request(rec);
+	if (ret!=0) {
+		/* Terminate the message like every other log line here */
+		DBG_ERR("Failed to initiate leader election\n");
+		return;
+	}
+
+	/* wait for a few seconds to collect all responses */
+	ctdb_wait_election(rec);
+}
+
+
+/*
+ * Handler for SRVIDs that are no longer implemented: log a warning
+ * naming the SRVID and otherwise ignore the message
+ */
+static void srvid_not_implemented(uint64_t srvid,
+				  TDB_DATA data,
+				  void *private_data)
+{
+	const char *label = "UNKNOWN";
+
+	if (srvid == CTDB_SRVID_SET_NODE_FLAGS) {
+		label = "CTDB_SRVID_SET_NODE_FLAGS";
+	}
+
+	D_WARNING("SRVID %s (0x%" PRIx64 ") is obsolete\n", label, srvid);
+}
+
+/*
+ * Handler for when we need to push out flag changes to all other nodes
+ *
+ * The message data is a struct ctdb_node_flag_change.  Messages too
+ * short to hold one are logged and dropped.  The nodemap is read from
+ * the current leader and, provided it contains the node named in the
+ * message, the message data is forwarded unchanged as a
+ * CTDB_CONTROL_MODIFY_FLAGS control to the nodes returned by
+ * list_of_connected_nodes() for that nodemap.
+ */
+static void push_flags_handler(uint64_t srvid, TDB_DATA data,
+			       void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(
+		private_data, struct ctdb_recoverd);
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct ctdb_node_flag_change *c = NULL;
+	struct ctdb_node_map_old *nodemap = NULL;
+	TALLOC_CTX *tmp_ctx = NULL;
+	uint32_t *nodes;
+	int ret;
+
+	/* Never read the flag change beyond the end of the message */
+	if (data.dptr == NULL ||
+	    data.dsize < sizeof(struct ctdb_node_flag_change)) {
+		DEBUG(DEBUG_ERR, (__location__ " Push flags message too short: %zu bytes, expecting at least %zu\n",
+				  data.dsize,
+				  sizeof(struct ctdb_node_flag_change)));
+		return;
+	}
+	c = (struct ctdb_node_flag_change *)data.dptr;
+
+	tmp_ctx = talloc_new(ctdb);
+
+	/* read the node flags from the leader */
+	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->leader,
+				   tmp_ctx, &nodemap);
+	if (ret != 0) {
+		/* The nodemap is requested from the leader, so name it */
+		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", rec->leader));
+		goto done;
+	}
+	if (c->pnn >= nodemap->num) {
+		DBG_ERR("Nodemap from leader does not contain node %u\n",
+			c->pnn);
+		goto done;
+	}
+
+	/* send the flags update to all connected nodes */
+	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
+				      nodes, 0, CONTROL_TIMEOUT(),
+				      false, data,
+				      NULL, NULL,
+				      NULL) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
+	}
+
+done:
+	talloc_free(tmp_ctx);
+}
+
+/*
+ * Timer callback: no leader broadcast arrived before the timeout, so
+ * forget the timer reference and force an election
+ */
+static void leader_broadcast_timeout_handler(struct tevent_context *ev,
+					     struct tevent_timer *te,
+					     struct timeval current_time,
+					     void *private_data)
+{
+	struct ctdb_recoverd *recoverd = talloc_get_type_abort(
+		private_data, struct ctdb_recoverd);
+
+	D_NOTICE("Leader broadcast timeout\n");
+
+	/* Only the reference is cleared here, the timer is not freed */
+	recoverd->leader_broadcast_timeout_te = NULL;
+
+	force_election(recoverd);
+}
+
+/*
+ * Cancel the leader broadcast timeout, if one is pending, by freeing
+ * its timer and clearing the reference
+ */
+static void leader_broadcast_timeout_cancel(struct ctdb_recoverd *rec)
+{
+	TALLOC_FREE(rec->leader_broadcast_timeout_te);
+}
+
+/*
+ * (Re)start the leader broadcast timeout
+ *
+ * Any pending timeout is cancelled first, then a new timer is set to
+ * fire ctdb_config.leader_timeout seconds from now.  Returns 0 on
+ * success, ENOMEM if the timer could not be created.
+ */
+static int leader_broadcast_timeout_start(struct ctdb_recoverd *rec)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct timeval deadline;
+
+	/*
+	 * This should not be necessary.  However, there will be
+	 * interactions with election code here.  It will want to
+	 * cancel and restart the timer around potentially long
+	 * elections.
+	 */
+	leader_broadcast_timeout_cancel(rec);
+
+	deadline = timeval_current_ofs(ctdb_config.leader_timeout, 0);
+
+	rec->leader_broadcast_timeout_te = tevent_add_timer(
+		ctdb->ev, rec, deadline,
+		leader_broadcast_timeout_handler, rec);
+
+	if (rec->leader_broadcast_timeout_te != NULL) {
+		return 0;
+	}
+
+	D_ERR("Unable to start leader broadcast timeout\n");
+	return ENOMEM;
+}
+
+/*
+ * Report whether a leader broadcast timeout timer is currently set
+ */
+static bool leader_broadcast_timeout_active(struct ctdb_recoverd *rec)
+{
+	if (rec->leader_broadcast_timeout_te == NULL) {
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Handler for leader broadcasts
+ *
+ * The message data is a uint32_t PNN.  A message that cannot be
+ * parsed is logged and ignored.  Otherwise the pending broadcast
+ * timeout is cancelled and:
+ *  - a broadcast naming the already known leader, or a new leader,
+ *    restarts the timeout (a new leader is recorded first);
+ *  - a broadcast of CTDB_UNKNOWN_PNN marks an election as in progress
+ *    and leaves the timeout cancelled.
+ */
+static void leader_handler(uint64_t srvid, TDB_DATA data, void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type_abort(
+		private_data, struct ctdb_recoverd);
+	uint32_t announced;
+	size_t consumed;
+	int rc;
+
+	rc = ctdb_uint32_pull(data.dptr, data.dsize, &announced, &consumed);
+	if (rc != 0) {
+		DBG_WARNING("Unable to parse leader broadcast, ret=%d\n", rc);
+		return;
+	}
+
+	leader_broadcast_timeout_cancel(rec);
+
+	if (announced != rec->leader) {
+		if (announced == CTDB_UNKNOWN_PNN) {
+			bool already_electing = rec->election_in_progress;
+
+			/*
+			 * Leader broadcast timeout was cancelled above
+			 * - stop main loop from restarting it until
+			 * election is complete
+			 */
+			rec->election_in_progress = true;
+
+			/*
+			 * This is the only notification for a cluster
+			 * lock election, so handle it here...
+			 */
+			if (cluster_lock_enabled(rec) && !already_electing) {
+				cluster_lock_election(rec);
+			}
+
+			return;
+		}
+
+		D_NOTICE("Received leader broadcast, leader=%"PRIu32"\n", announced);
+		rec->leader = announced;
+	}
+
+	leader_broadcast_timeout_start(rec);
+}
+
+/*
+ * State shared between verify_recmode() and the per-node callback
+ * verify_recmode_normal_callback()
+ */
+struct verify_recmode_normal_data {
+ uint32_t count; /* getrecmode controls still awaiting a response */
+ enum monitor_result status; /* starts as MONITOR_OK, updated by the callback */
+};
+
+/*
+ * Completion callback for one node's getrecmode control
+ *
+ * Decrements the outstanding count.  A control that did not complete
+ * turns an OK status into MONITOR_FAILED; a completed control that
+ * reports anything other than CTDB_RECOVERY_NORMAL sets the status to
+ * MONITOR_RECOVERY_NEEDED.
+ */
+static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
+{
+	struct verify_recmode_normal_data *shared = talloc_get_type(
+		state->async.private_data, struct verify_recmode_normal_data);
+
+	/* one more node has responded with recmode data*/
+	shared->count--;
+
+	if (state->state == CTDB_CONTROL_DONE) {
+		/* if we got a response, then the recmode will be stored
+		   in the status field
+		 */
+		if (state->status != CTDB_RECOVERY_NORMAL) {
+			DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
+			shared->status = MONITOR_RECOVERY_NEEDED;
+		}
+		return;
+	}
+
+	/* if we failed to get the recmode, then return an error and let
+	   the main loop try again.
+	 */
+	if (shared->status == MONITOR_OK) {
+		shared->status = MONITOR_FAILED;
+	}
+}
+
+
+/*
+ * Verify that all nodes are in normal recovery mode
+ *
+ * Sends an asynchronous getrecmode control to every node in the
+ * nodemap that has no NODE_FLAGS_INACTIVE bit set and runs the event
+ * loop until each one has been answered.  Returns the status collected
+ * by verify_recmode_normal_callback(), or MONITOR_FAILED if a control
+ * could not be sent.
+ */
+static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
+{
+	TALLOC_CTX *scratch = talloc_new(ctdb);
+	struct verify_recmode_normal_data *shared;
+	enum monitor_result result;
+	unsigned int idx;
+
+	shared = talloc(scratch, struct verify_recmode_normal_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, shared);
+	shared->status = MONITOR_OK;
+	shared->count = 0;
+
+	/* loop over all active nodes and send an async getrecmode call to
+	   them*/
+	for (idx = 0; idx < nodemap->num; idx++) {
+		struct ctdb_client_control_state *req;
+
+		if ((nodemap->nodes[idx].flags & NODE_FLAGS_INACTIVE) != 0) {
+			continue;
+		}
+
+		req = ctdb_ctrl_getrecmode_send(ctdb, scratch,
+						CONTROL_TIMEOUT(),
+						nodemap->nodes[idx].pnn);
+		if (req == NULL) {
+			/* we failed to send the control, treat this as
+			   an error and try again next iteration
+			 */
+			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
+			talloc_free(scratch);
+			return MONITOR_FAILED;
+		}
+
+		/* set up the callback functions */
+		req->async.private_data = shared;
+		req->async.fn = verify_recmode_normal_callback;
+
+		/* one more control to wait for to complete */
+		shared->count++;
+	}
+
+	/* now wait for up to the maximum number of seconds allowed
+	   or until all nodes we expect a response from has replied
+	 */
+	while (shared->count > 0) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	result = shared->status;
+	talloc_free(scratch);
+
+	return result;
+}
+
+
+/*
+ * Compare the local node's current interface list with the one
+ * remembered in rec->ifaces
+ *
+ * Returns true if there is no remembered list, if the number of
+ * interfaces differs, or if any slot differs in name or link state.
+ * Also returns true if the interface list cannot be fetched.  When
+ * the list is fetched it replaces rec->ifaces.
+ */
+static bool interfaces_have_changed(struct ctdb_context *ctdb,
+				    struct ctdb_recoverd *rec)
+{
+	TALLOC_CTX *scratch = talloc_new(NULL);
+	struct ctdb_iface_list_old *fresh = NULL;
+	bool changed = false;
+	unsigned int i;
+	int rc;
+
+	/* Read the interfaces from the local node */
+	rc = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
+				  CTDB_CURRENT_NODE, scratch, &fresh);
+	if (rc != 0) {
+		D_ERR("Unable to get interfaces from local node %u\n", rec->pnn);
+		/* We could return an error.  However, this will be
+		 * rare so we'll decide that the interfaces have
+		 * actually changed, just in case.
+		 */
+		talloc_free(scratch);
+		return true;
+	}
+
+	if (rec->ifaces == NULL) {
+		/* We haven't been here before so things have changed */
+		DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
+		changed = true;
+	} else if (rec->ifaces->num != fresh->num) {
+		/* Number of interfaces has changed */
+		DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
+				     rec->ifaces->num, fresh->num));
+		changed = true;
+	} else {
+		/* See if interface names or link states have changed;
+		 * stop at the first slot that differs
+		 */
+		for (i = 0; i < rec->ifaces->num && !changed; i++) {
+			struct ctdb_iface *known = &rec->ifaces->ifaces[i];
+
+			if (strcmp(known->name, fresh->ifaces[i].name) != 0) {
+				DEBUG(DEBUG_NOTICE,
+				      ("Interface in slot %d changed: %s => %s\n",
+				       i, known->name, fresh->ifaces[i].name));
+				changed = true;
+			} else if (known->link_state != fresh->ifaces[i].link_state) {
+				DEBUG(DEBUG_NOTICE,
+				      ("Interface %s changed state: %d => %d\n",
+				       known->name, known->link_state,
+				       fresh->ifaces[i].link_state));
+				changed = true;
+			}
+		}
+	}
+
+	/* Remember the list just fetched for the next comparison */
+	talloc_free(rec->ifaces);
+	rec->ifaces = talloc_steal(rec, fresh);
+
+	talloc_free(scratch);
+	return changed;
+}
+
+/* Check that the local allocation of public IP addresses is correct
+ * and do some house-keeping
+ *
+ * Returns 0 when the checks were skipped or completed, -1 if a list
+ * of public IPs could not be retrieved from the local node. When a
+ * problem is detected a CTDB_SRVID_TAKEOVER_RUN message is sent to
+ * CTDB_BROADCAST_CONNECTED; a failure to send it is logged but the
+ * function still returns 0. */
+static int verify_local_ip_allocation(struct ctdb_recoverd *rec)
+{
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ struct ctdb_context *ctdb = rec->ctdb;
+ unsigned int j;
+ int ret;
+ bool need_takeover_run = false;
+ struct ctdb_public_ip_list_old *ips = NULL;
+
+ /* If we are not the leader then do some housekeeping */
+ if (!this_node_is_leader(rec)) {
+ /* Ignore any IP reallocate requests - only leader
+ * processes them
+ */
+ TALLOC_FREE(rec->reallocate_requests);
+ /* Clear any nodes that should be force rebalanced in
+ * the next takeover run. If the leader has changed
+ * then we don't want to process these some time in
+ * the future.
+ */
+ TALLOC_FREE(rec->force_rebalance_nodes);
+ }
+
+ /* Return early if disabled... */
+ if (ctdb_config.failover_disabled ||
+ ctdb_op_is_disabled(rec->takeover_run)) {
+ talloc_free(mem_ctx);
+ return 0;
+ }
+
+ if (interfaces_have_changed(ctdb, rec)) {
+ need_takeover_run = true;
+ }
+
+ /* If there are unhosted IPs but this node can host them then
+ * trigger an IP reallocation. "Can host" is tested below as
+ * this node having no flags set at all in the nodemap. */
+
+ /* Read *available* IPs from local node */
+ ret = ctdb_ctrl_get_public_ips_flags(
+ ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
+ CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
+ talloc_free(mem_ctx);
+ return -1;
+ }
+
+ for (j=0; j<ips->num; j++) {
+ if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
+ rec->nodemap->nodes[rec->pnn].flags == 0) {
+ DEBUG(DEBUG_WARNING,
+ ("Unassigned IP %s can be served by this node\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
+ }
+ }
+
+ talloc_free(ips);
+
+ if (!ctdb->do_checkpublicip) {
+ goto done;
+ }
+
+ /* Validate the IP addresses that this node has on network
+ * interfaces. If there is an inconsistency between reality
+ * and the state expected by CTDB then try to fix it by
+ * triggering an IP reallocation or releasing extraneous IP
+ * addresses. Both kinds of mismatch checked below (an assigned
+ * IP missing from the interfaces, or an IP present that is not
+ * assigned to this node) only request a takeover run here. */
+
+ /* Read *known* IPs from local node */
+ ret = ctdb_ctrl_get_public_ips_flags(
+ ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
+ talloc_free(mem_ctx);
+ return -1;
+ }
+
+ for (j=0; j<ips->num; j++) {
+ if (ips->ips[j].pnn == rec->pnn) {
+ if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_ERR,
+ ("Assigned IP %s not on an interface\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
+ }
+ } else {
+ if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
+ DEBUG(DEBUG_ERR,
+ ("IP %s incorrectly on an interface\n",
+ ctdb_addr_to_str(&ips->ips[j].addr)));
+ need_takeover_run = true;
+ }
+ }
+ }
+
+done:
+ if (need_takeover_run) {
+ struct ctdb_srvid_message rd;
+ TDB_DATA data;
+
+ DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
+
+ ZERO_STRUCT(rd);
+ rd.pnn = rec->pnn;
+ rd.srvid = 0; /* NOTE(review): presumably 0 means no reply is wanted - confirm */
+ data.dptr = (uint8_t *)&rd;
+ data.dsize = sizeof(rd);
+
+ ret = ctdb_client_send_message(ctdb,
+ CTDB_BROADCAST_CONNECTED,
+ CTDB_SRVID_TAKEOVER_RUN,
+ data);
+ if (ret != 0) {
+ D_ERR("Failed to send takeover run request\n");
+ }
+ }
+ talloc_free(mem_ctx);
+ return 0;
+}
+
+
+/*
+ * Callback state for get_remote_nodemaps() and its async callbacks
+ */
+struct remote_nodemaps_state {
+ struct ctdb_node_map_old **remote_nodemaps; /* one slot per entry of rec->nodemap, filled in by async_getnodemap_callback() */
+ struct ctdb_recoverd *rec; /* recovery daemon state the nodemaps are collected for */
+};
+
+/*
+ * Success callback for one node's GET_NODEMAP control
+ *
+ * Looks up the replying node's position in rec->nodemap and stores the
+ * returned nodemap in the slot with the same index, re-parenting it
+ * onto the remote_nodemaps array.  A reply from a PNN that is not in
+ * rec->nodemap is logged and dropped.
+ */
+static void async_getnodemap_callback(struct ctdb_context *ctdb,
+				      uint32_t node_pnn,
+				      int32_t res,
+				      TDB_DATA outdata,
+				      void *callback_data)
+{
+	struct remote_nodemaps_state *state =
+		(struct remote_nodemaps_state *)callback_data;
+	struct ctdb_node_map_old *local = state->rec->nodemap;
+	size_t slot = 0;
+
+	while (slot < local->num && local->nodes[slot].pnn != node_pnn) {
+		slot++;
+	}
+
+	if (slot >= local->num) {
+		DBG_ERR("Invalid PNN %"PRIu32"\n", node_pnn);
+		return;
+	}
+
+	state->remote_nodemaps[slot] =
+		(struct ctdb_node_map_old *)talloc_steal(
+			state->remote_nodemaps, outdata.dptr);
+}
+
+/*
+ * Failure callback for one node's GET_NODEMAP control: log the
+ * failure and mark the node as a culprit
+ */
+static void async_getnodemap_error(struct ctdb_context *ctdb,
+				   uint32_t node_pnn,
+				   int32_t res,
+				   TDB_DATA outdata,
+				   void *callback_data)
+{
+	struct remote_nodemaps_state *state = callback_data;
+
+	DBG_ERR("Failed to retrieve nodemap from node %u\n", node_pnn);
+	ctdb_set_culprit(state->rec, node_pnn);
+}
+
+/*
+ * Fetch the nodemaps of the nodes returned by
+ * list_of_connected_nodes() for rec->nodemap
+ *
+ * On success returns 0 and stores in *remote_nodemaps an array,
+ * allocated on mem_ctx, with one slot per entry of rec->nodemap; slots
+ * for nodes that did not reply stay NULL.  Returns -1 if the array
+ * cannot be allocated, or the non-zero result of the async control on
+ * failure, in which case the array is freed and *remote_nodemaps is
+ * left untouched.
+ */
+static int get_remote_nodemaps(struct ctdb_recoverd *rec,
+			       TALLOC_CTX *mem_ctx,
+			       struct ctdb_node_map_old ***remote_nodemaps)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+	struct remote_nodemaps_state state;
+	struct ctdb_node_map_old **maps;
+	uint32_t *targets;
+	int rc;
+
+	maps = talloc_zero_array(mem_ctx,
+				 struct ctdb_node_map_old *,
+				 rec->nodemap->num);
+	if (maps == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return -1;
+	}
+
+	targets = list_of_connected_nodes(ctdb, rec->nodemap, mem_ctx, false);
+
+	state.rec = rec;
+	state.remote_nodemaps = maps;
+
+	rc = ctdb_client_async_control(ctdb,
+				       CTDB_CONTROL_GET_NODEMAP,
+				       targets,
+				       0,
+				       CONTROL_TIMEOUT(),
+				       false,
+				       tdb_null,
+				       async_getnodemap_callback,
+				       async_getnodemap_error,
+				       &state);
+	talloc_free(targets);
+
+	if (rc == 0) {
+		*remote_nodemaps = maps;
+		return 0;
+	}
+
+	talloc_free(maps);
+	return rc;
+}
+
+static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
+ TALLOC_CTX *mem_ctx)
+{
+ struct ctdb_node_map_old *nodemap=NULL;
+ struct ctdb_node_map_old **remote_nodemaps=NULL;
+ struct ctdb_vnn_map *vnnmap=NULL;
+ struct ctdb_vnn_map *remote_vnnmap=NULL;
+ uint32_t num_lmasters;
+ int32_t debug_level;
+ unsigned int i, j;
+ int ret;
+ bool self_ban;
+
+
+ /* verify that the main daemon is still running */
+ if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
+ DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
+ exit(-1);
+ }
+
+ /* ping the local daemon to tell it we are alive */
+ ctdb_ctrl_recd_ping(ctdb);
+
+ if (rec->election_in_progress) {
+ /* an election is in progress */
+ return;
+ }
+
+ /*
+ * Start leader broadcasts if they are not active (1st time
+ * through main loop? Memory allocation error?)
+ */
+ if (!leader_broadcast_loop_active(rec)) {
+ ret = leader_broadcast_loop(rec);
+ if (ret != 0) {
+ D_ERR("Failed to set up leader broadcast\n");
+ ctdb_set_culprit(rec, rec->pnn);
+ }
+ }
+ /*
+ * Similar for leader broadcast timeouts. These can also have
+ * been stopped by another node receiving a leader broadcast
+ * timeout and transmitting an "unknown leader broadcast".
+ * Note that this should never be done during an election - at
+ * the moment there is nothing between here and the above
+ * election-in-progress check that can process an election
+ * result (i.e. no event loop).
+ */
+ if (!leader_broadcast_timeout_active(rec)) {
+ ret = leader_broadcast_timeout_start(rec);
+ if (ret != 0) {
+ ctdb_set_culprit(rec, rec->pnn);
+ }
+ }
+
+
+ /* read the debug level from the parent and update locally */
+ ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
+ if (ret !=0) {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
+ return;
+ }
+ debuglevel_set(debug_level);
+
+ /* get relevant tunables */
+ ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
+ return;
+ }
+
+ /* get runstate */
+ ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE, &ctdb->runstate);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
+ return;
+ }
+
+ /* get nodemap */
+ ret = ctdb_ctrl_getnodemap(ctdb,
+ CONTROL_TIMEOUT(),
+ rec->pnn,
+ rec,
+ &nodemap);
+ if (ret != 0) {
+ DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", rec->pnn);
+ return;
+ }
+ talloc_free(rec->nodemap);
+ rec->nodemap = nodemap;
+
+ /* remember our own node flags */
+ rec->node_flags = nodemap->nodes[rec->pnn].flags;
+
+ ban_misbehaving_nodes(rec, &self_ban);
+ if (self_ban) {
+ DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
+ return;
+ }
+
+ ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE, &ctdb->recovery_mode);
+ if (ret != 0) {
+ D_ERR("Failed to read recmode from local node\n");
+ return;
+ }
+
+ /* if the local daemon is STOPPED or BANNED, we verify that the databases are
+ also frozen and that the recmode is set to active.
+ */
+ if (rec->node_flags & NODE_FLAGS_INACTIVE) {
+ /* If this node has become inactive then we want to
+ * reduce the chances of it taking over the leader
+ * role when it becomes active again. This
+ * helps to stabilise the leader role so that
+ * it stays on the most stable node.
+ */
+ rec->priority_time = timeval_current();
+
+ if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
+ DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
+
+ ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
+
+ return;
+ }
+ }
+ if (! rec->frozen_on_inactive) {
+ ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
+ CTDB_CURRENT_NODE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " Failed to freeze node "
+ "in STOPPED or BANNED state\n"));
+ return;
+ }
+
+ rec->frozen_on_inactive = true;
+ }
+
+ /* If this node is stopped or banned then it is not the recovery
+ * master, so don't do anything. This prevents stopped or banned
+ * node from starting election and sending unnecessary controls.
+ */
+ return;
+ }
+
+ rec->frozen_on_inactive = false;
+
+ /* Retrieve capabilities from all connected nodes */
+ ret = update_capabilities(rec, nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
+ return;
+ }
+
+ if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
+ /* Check if an IP takeover run is needed and trigger one if
+ * necessary */
+ verify_local_ip_allocation(rec);
+ }
+
+ /* If this node is not the leader then skip recovery checks */
+ if (!this_node_is_leader(rec)) {
+ return;
+ }
+
+
+ /* Get the nodemaps for all connected remote nodes */
+ ret = get_remote_nodemaps(rec, mem_ctx, &remote_nodemaps);
+ if (ret != 0) {
+ DBG_ERR("Failed to read remote nodemaps\n");
+ return;
+ }
+
+ /* Ensure our local and remote flags are correct */
+ ret = update_flags(rec, nodemap, remote_nodemaps);
+ if (ret != 0) {
+ D_ERR("Unable to update flags\n");
+ return;
+ }
+
+ if (ctdb->num_nodes != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
+ ctdb_load_nodes_file(ctdb);
+ return;
+ }
+
+ /* get the vnnmap */
+ ret = ctdb_ctrl_getvnnmap(ctdb,
+ CONTROL_TIMEOUT(),
+ rec->pnn,
+ mem_ctx,
+ &vnnmap);
+ if (ret != 0) {
+ DBG_ERR("Unable to get vnnmap from node %u\n", rec->pnn);
+ return;
+ }
+
+ if (rec->need_recovery) {
+ /* a previous recovery didn't finish */
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* verify that all active nodes are in normal mode
+ and not in recovery mode
+ */
+ switch (verify_recmode(ctdb, nodemap)) {
+ case MONITOR_RECOVERY_NEEDED:
+ do_recovery(rec, mem_ctx);
+ return;
+ case MONITOR_FAILED:
+ return;
+ case MONITOR_ELECTION_NEEDED:
+ /* can not happen */
+ case MONITOR_OK:
+ break;
+ }
+
+ if (cluster_lock_enabled(rec)) {
+ /* We must already hold the cluster lock */
+ if (!cluster_lock_held(rec)) {
+ D_ERR("Failed cluster lock sanity check\n");
+ ctdb_set_culprit(rec, rec->pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+
+
+ /* If recoveries are disabled then there is no use doing any
+ * nodemap or flags checks. Recoveries might be disabled due
+ * to "reloadnodes", so doing these checks might cause an
+ * unnecessary recovery. */
+ if (ctdb_op_is_disabled(rec->recovery)) {
+ goto takeover_run_checks;
+ }
+
+ /* verify that all other nodes have the same nodemap as we have
+ */
+ for (j=0; j<nodemap->num; j++) {
+ if (nodemap->nodes[j].pnn == rec->pnn) {
+ continue;
+ }
+ if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+
+ /* if the nodes disagree on how many nodes there are
+ then this is a good reason to try recovery
+ */
+ if (remote_nodemaps[j]->num != nodemap->num) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
+ nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* if the nodes disagree on which nodes exist and are
+ active, then that is also a good reason to do recovery
+ */
+ for (i=0;i<nodemap->num;i++) {
+ if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
+ nodemap->nodes[j].pnn, i,
+ remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+ }
+
+ /* count how many active nodes there are */
+ num_lmasters = 0;
+ for (i=0; i<nodemap->num; i++) {
+ if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
+ if (ctdb_node_has_capabilities(rec->caps,
+ ctdb->nodes[i]->pnn,
+ CTDB_CAP_LMASTER)) {
+ num_lmasters++;
+ }
+ }
+ }
+
+
+ /* There must be the same number of lmasters in the vnn map as
+ * there are active nodes with the lmaster capability... or
+ * do a recovery.
+ */
+ if (vnnmap->size != num_lmasters) {
+ DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
+ vnnmap->size, num_lmasters));
+ ctdb_set_culprit(rec, rec->pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /*
+ * Verify that all active lmaster nodes in the nodemap also
+ * exist in the vnnmap
+ */
+ for (j=0; j<nodemap->num; j++) {
+ if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+ if (! ctdb_node_has_capabilities(rec->caps,
+ nodemap->nodes[j].pnn,
+ CTDB_CAP_LMASTER)) {
+ continue;
+ }
+ if (nodemap->nodes[j].pnn == rec->pnn) {
+ continue;
+ }
+
+ for (i=0; i<vnnmap->size; i++) {
+ if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
+ break;
+ }
+ }
+ if (i == vnnmap->size) {
+ D_ERR("Active LMASTER node %u is not in the vnnmap\n",
+ nodemap->nodes[j].pnn);
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+
+
+ /* verify that all other nodes have the same vnnmap
+ and are from the same generation
+ */
+ for (j=0; j<nodemap->num; j++) {
+ if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+ continue;
+ }
+ if (nodemap->nodes[j].pnn == rec->pnn) {
+ continue;
+ }
+
+ ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
+ mem_ctx, &remote_vnnmap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
+ nodemap->nodes[j].pnn));
+ return;
+ }
+
+ /* verify the vnnmap generation is the same */
+ if (vnnmap->generation != remote_vnnmap->generation) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
+ nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* verify the vnnmap size is the same */
+ if (vnnmap->size != remote_vnnmap->size) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
+ nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+
+ /* verify the vnnmap is the same */
+ for (i=0;i<vnnmap->size;i++) {
+ if (remote_vnnmap->map[i] != vnnmap->map[i]) {
+ DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
+ nodemap->nodes[j].pnn));
+ ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+ do_recovery(rec, mem_ctx);
+ return;
+ }
+ }
+ }
+
+ /* FIXME: Add remote public IP checking to ensure that nodes
+ * have the IP addresses that are allocated to them. */
+
+takeover_run_checks:
+
+ /* If there are IP takeover runs requested or the previous one
+ * failed then perform one and notify the waiters */
+ if (!ctdb_op_is_disabled(rec->takeover_run) &&
+ (rec->reallocate_requests || rec->need_takeover_run)) {
+ process_ipreallocate_requests(ctdb, rec);
+ }
+}
+
+/*
+ * SIGTERM handler for the recovery daemon: log the signal, call
+ * cluster_lock_release() and exit with status 0
+ */
+static void recd_sig_term_handler(struct tevent_context *ev,
+				  struct tevent_signal *se,
+				  int signum,
+				  int count,
+				  void *dont_care,
+				  void *private_data)
+{
+	struct ctdb_recoverd *rec;
+
+	rec = talloc_get_type_abort(private_data, struct ctdb_recoverd);
+
+	DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
+	cluster_lock_release(rec);
+	exit(0);
+}
+
+/*
+ * Periodically log elements of the cluster state
+ *
+ * This can be used to confirm a split brain has occurred
+ *
+ * Timer callback.  Nothing is logged unless this node is the leader
+ * and rec->nodemap has been populated.  The cluster is considered
+ * complete when no non-deleted node other than this one has
+ * NODE_FLAGS_DISCONNECTED set.  A warning is logged when the cluster
+ * becomes incomplete, then on every check for the first 10 minutes,
+ * on checks where the elapsed minutes are a multiple of 10 for the
+ * rest of the first hour, and on checks where they are a multiple of
+ * 60 after that.  A warning is also logged when the cluster becomes
+ * complete again.  Every path re-arms the timer to fire in 60
+ * seconds.
+ */
+static void maybe_log_cluster_state(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval current_time,
+ void *private_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type_abort(
+ private_data, struct ctdb_recoverd);
+ struct ctdb_context *ctdb = rec->ctdb;
+ struct tevent_timer *tt;
+
+ static struct timeval start_incomplete = { /* zero while the cluster was complete at the previous check */
+ .tv_sec = 0,
+ };
+
+ bool is_complete;
+ bool was_complete;
+ unsigned int i;
+ double seconds;
+ unsigned int minutes;
+ unsigned int num_connected;
+
+ if (!this_node_is_leader(rec)) {
+ goto done;
+ }
+
+ if (rec->nodemap == NULL) {
+ goto done;
+ }
+
+ is_complete = true;
+ num_connected = 0; /* counts other nodes that are neither deleted nor disconnected */
+ for (i = 0; i < rec->nodemap->num; i++) {
+ struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
+
+ if (n->pnn == rec->pnn) {
+ continue;
+ }
+ if ((n->flags & NODE_FLAGS_DELETED) != 0) {
+ continue;
+ }
+ if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
+ is_complete = false;
+ continue;
+ }
+
+ num_connected++;
+ }
+
+ was_complete = timeval_is_zero(&start_incomplete);
+
+ if (is_complete) {
+ if (! was_complete) {
+ D_WARNING("Cluster complete with leader=%u\n",
+ rec->leader);
+ start_incomplete = timeval_zero(); /* reset so the next incomplete period starts afresh */
+ }
+ goto done;
+ }
+
+ /* Cluster is newly incomplete... */
+ if (was_complete) {
+ start_incomplete = current_time;
+ minutes = 0;
+ goto log;
+ }
+
+ /*
+ * Cluster has been incomplete since previous check, so figure
+ * out how long (in minutes) and decide whether to log anything
+ */
+ seconds = timeval_elapsed2(&start_incomplete, &current_time);
+ minutes = (unsigned int)seconds / 60; /* the cast binds first: seconds is truncated, then divided */
+ if (minutes >= 60) {
+ /* Over an hour, log every hour */
+ if (minutes % 60 != 0) {
+ goto done;
+ }
+ } else if (minutes >= 10) {
+ /* Over 10 minutes, log every 10 minutes */
+ if (minutes % 10 != 0) {
+ goto done;
+ }
+ }
+
+log:
+ D_WARNING("Cluster incomplete with leader=%u, elapsed=%u minutes, "
+ "connected=%u\n",
+ rec->leader,
+ minutes,
+ num_connected);
+
+done:
+ tt = tevent_add_timer(ctdb->ev, /* re-arm: this callback schedules itself again */
+ rec,
+ timeval_current_ofs(60, 0),
+ maybe_log_cluster_state,
+ rec);
+ if (tt == NULL) {
+ DBG_WARNING("Failed to set up cluster state timer\n");
+ }
+}
+
+/*
+ * SIGHUP hook: pass SIGHUP on to the helper process, if a helper pid
+ * is currently recorded
+ */
+static void recd_sighup_hook(void *private_data)
+{
+	struct ctdb_recoverd *rec;
+
+	rec = talloc_get_type_abort(private_data, struct ctdb_recoverd);
+	if (rec->helper_pid <= 0) {
+		return;
+	}
+
+	kill(rec->helper_pid, SIGHUP);
+}
+
+/*
+ the main monitoring loop
+
+ Allocates the recovery daemon state, installs the SIGHUP and SIGTERM
+ handlers, registers the message handlers and then calls main_loop()
+ forever.  This function does not return.
+ */
+static void monitor_cluster(struct ctdb_context *ctdb)
+{
+ struct tevent_signal *se;
+ struct ctdb_recoverd *rec;
+ bool status;
+
+ DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
+
+ rec = talloc_zero(ctdb, struct ctdb_recoverd);
+ CTDB_NO_MEMORY_FATAL(ctdb, rec);
+
+ rec->ctdb = ctdb;
+ rec->leader = CTDB_UNKNOWN_PNN;
+ rec->pnn = ctdb_get_pnn(ctdb);
+ rec->cluster_lock_handle = NULL;
+ rec->helper_pid = -1; /* recd_sighup_hook only signals a helper when this is > 0 */
+
+ rec->takeover_run = ctdb_op_init(rec, "takeover runs");
+ CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
+
+ rec->recovery = ctdb_op_init(rec, "recoveries");
+ CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
+
+ rec->priority_time = timeval_current();
+ rec->frozen_on_inactive = false;
+
+ status = logging_setup_sighup_handler(rec->ctdb->ev,
+ rec,
+ recd_sighup_hook,
+ rec);
+ if (!status) {
+ D_ERR("Failed to install SIGHUP handler\n");
+ exit(1);
+ }
+
+ se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
+ recd_sig_term_handler, rec);
+ if (se == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
+ exit(1);
+ }
+
+ if (!cluster_lock_enabled(rec)) { /* cluster state logging is only started when no cluster lock is configured */
+ struct tevent_timer *tt;
+
+ tt = tevent_add_timer(ctdb->ev,
+ rec,
+ timeval_current_ofs(60, 0),
+ maybe_log_cluster_state,
+ rec);
+ if (tt == NULL) {
+ DBG_WARNING("Failed to set up cluster state timer\n");
+ }
+ }
+
+ /* register a message port for sending memory dumps */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
+
+ /* when a node is assigned banning credits */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
+ banning_handler, rec);
+
+ /* register a message port for recovery elections */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
+
+ ctdb_client_set_message_handler(ctdb,
+ CTDB_SRVID_SET_NODE_FLAGS,
+ srvid_not_implemented,
+ rec);
+
+ /* when we are asked to push out a flag change */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
+
+ /* register a message port for reloadnodes */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
+
+ /* register a message port for performing a takeover run */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
+
+ /* register a message port for disabling the ip check for a short while */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
+
+ /* register a message port for forcing a rebalance of a node next
+ reallocation */
+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
+
+ /* Register a message port for disabling takeover runs */
+ ctdb_client_set_message_handler(ctdb,
+ CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+ disable_takeover_runs_handler, rec);
+
+ /* Register a message port for disabling recoveries */
+ ctdb_client_set_message_handler(ctdb,
+ CTDB_SRVID_DISABLE_RECOVERIES,
+ disable_recoveries_handler, rec);
+
+ ctdb_client_set_message_handler(ctdb,
+ CTDB_SRVID_LEADER,
+ leader_handler,
+ rec);
+
+ for (;;) {
+ TALLOC_CTX *mem_ctx = talloc_new(ctdb); /* scratch context, freed after each main_loop() pass */
+ struct timeval start;
+ double elapsed;
+
+ if (!mem_ctx) {
+ DEBUG(DEBUG_CRIT,(__location__
+ " Failed to create temp context\n"));
+ exit(-1);
+ }
+
+ start = timeval_current();
+ main_loop(ctdb, rec, mem_ctx);
+ talloc_free(mem_ctx);
+
+ /* pace the loop: if main_loop() took less than the
+ recover_interval tunable, wait for the remainder */
+ elapsed = timeval_elapsed(&start);
+ if (elapsed < ctdb->tunable.recover_interval) {
+ ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
+ - elapsed);
+ }
+ }
+}
+
+/*
+ * fd event handler registered by ctdb_start_recoverd() on the read end
+ * of the pipe shared with the main ctdbd: any event on it is treated
+ * as the parent having died, so the recovery daemon exits immediately
+ */
+static void ctdb_recoverd_parent(struct tevent_context *ev,
+				 struct tevent_fd *fde,
+				 uint16_t flags,
+				 void *private_data)
+{
+	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
+	_exit(1);
+}
+
+/*
+ * Timer callback that probes the recovery daemon with signal 0.
+ *
+ * While the probe succeeds the check is re-armed for 30 seconds later.
+ * When it fails, a restart of the recovery daemon is scheduled
+ * immediately and the periodic check is not re-armed here.
+ */
+static void ctdb_check_recd(struct tevent_context *ev,
+			    struct tevent_timer *te,
+			    struct timeval yt, void *p)
+{
+	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+	int probe;
+
+	probe = ctdb_kill(ctdb, ctdb->recoverd_pid, 0);
+	if (probe == 0) {
+		/* still there: look again in 30 seconds */
+		tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
+				 timeval_current_ofs(30, 0),
+				 ctdb_check_recd, ctdb);
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
+
+	tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
+			 ctdb_restart_recd, ctdb);
+}
+
+/*
+ * SIGCHLD handler for the recovery daemon: reap every child that has
+ * already exited, without blocking.
+ *
+ * Stops when waitpid() reports that no further child is ready (0) or
+ * fails; a failure other than ECHILD is logged.
+ */
+static void recd_sig_child_handler(struct tevent_context *ev,
+				   struct tevent_signal *se, int signum,
+				   int count, void *dont_care,
+				   void *private_data)
+{
+	for (;;) {
+		int status;
+		pid_t pid = waitpid(-1, &status, WNOHANG);
+
+		if (pid == -1) {
+			if (errno != ECHILD) {
+				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
+			}
+			return;
+		}
+		if (pid > 0) {
+			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
+		}
+		if (pid == 0) {
+			break;
+		}
+	}
+}
+
+/*
+  startup the recovery daemon as a child of the main ctdb daemon
+
+  A pipe is created before forking.  The parent closes the read end,
+  keeps the write end open, arms the periodic ctdb_check_recd() timer
+  and returns 0.  The child closes the write end and registers
+  ctdb_recoverd_parent() on the read end, then runs monitor_cluster().
+
+  Returns -1 if the pipe or the fork fails (neither pipe descriptor is
+  left open in the fork-failure case), or if the parent cannot
+  allocate its recd context.  In the child, -1 is returned if logging
+  setup fails or if monitor_cluster() ever returns.
+ */
+int ctdb_start_recoverd(struct ctdb_context *ctdb)
+{
+	int fd[2];
+	struct tevent_signal *se;
+	struct tevent_fd *fde;
+	int ret;
+
+	if (pipe(fd) != 0) {
+		return -1;
+	}
+
+	ctdb->recoverd_pid = ctdb_fork(ctdb);
+	if (ctdb->recoverd_pid == -1) {
+		/* no child exists, so neither end of the pipe is needed */
+		close(fd[0]);
+		close(fd[1]);
+		return -1;
+	}
+
+	if (ctdb->recoverd_pid != 0) {
+		/*
+		 * Parent: drop the read end first so that it is not
+		 * leaked if the allocation below makes us return early
+		 */
+		close(fd[0]);
+
+		talloc_free(ctdb->recd_ctx);
+		ctdb->recd_ctx = talloc_new(ctdb);
+		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
+
+		tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
+				 timeval_current_ofs(30, 0),
+				 ctdb_check_recd, ctdb);
+		return 0;
+	}
+
+	/* Child: only the read end is used from here on */
+	close(fd[1]);
+
+	srandom(getpid() ^ time(NULL));
+
+	ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
+	if (ret != 0) {
+		return -1;
+	}
+
+	prctl_set_comment("ctdb_recoverd");
+	if (switch_from_server_to_client(ctdb) != 0) {
+		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
+		exit(1);
+	}
+
+	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
+
+	fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
+			    ctdb_recoverd_parent, &fd[0]);
+	if (fde == NULL) {
+		/* without this event the daemon could not notice its parent dying */
+		DEBUG(DEBUG_CRIT,("Failed to set up parent pipe handler in recovery daemon\n"));
+		exit(1);
+	}
+	tevent_fd_set_auto_close(fde);
+
+	/* set up a handler to pick up sigchld */
+	se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
+			       recd_sig_child_handler, ctdb);
+	if (se == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
+		exit(1);
+	}
+
+	monitor_cluster(ctdb);
+
+	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
+	return -1;
+}
+
+/*
+ * Shut down the recovery daemon.
+ *
+ * Does nothing when no recovery daemon pid is recorded.  Otherwise
+ * SIGTERM is sent to it and the parent-side recd context and ping
+ * counter are released.
+ */
+void ctdb_stop_recoverd(struct ctdb_context *ctdb)
+{
+	if (ctdb->recoverd_pid != 0) {
+		DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
+		ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
+
+		TALLOC_FREE(ctdb->recd_ctx);
+		TALLOC_FREE(ctdb->recd_ping_count);
+	}
+}
+
+/*
+ * Timer callback: stop the recovery daemon and start a new one
+ */
+static void ctdb_restart_recd(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval t,
+			      void *private_data)
+{
+	struct ctdb_context *ctdb;
+
+	ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
+	ctdb_stop_recoverd(ctdb);
+	ctdb_start_recoverd(ctdb);
+}
diff --git a/ctdb/server/ctdb_recovery_helper.c b/ctdb/server/ctdb_recovery_helper.c
new file mode 100644
index 0000000..e0d3219
--- /dev/null
+++ b/ctdb/server/ctdb_recovery_helper.c
@@ -0,0 +1,3200 @@
+/*
+ ctdb parallel database recovery
+
+ Copyright (C) Amitay Isaacs 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+#include <libgen.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/time.h"
+#include "lib/util/tevent_unix.h"
+#include "lib/util/util.h"
+#include "lib/util/smb_strtox.h"
+
+#include "protocol/protocol.h"
+#include "protocol/protocol_api.h"
+#include "client/client.h"
+
+#include "common/logging.h"
+
+static int recover_timeout = 30;
+
+#define NUM_RETRIES 3
+
+#define TIMEOUT() timeval_current_ofs(recover_timeout, 0)
+
+/*
+ * Utility functions
+ */
+
+/*
+ * Common receive helper for requests that carry no result data.
+ *
+ * Returns true if the request did not end in a unix error.  Otherwise
+ * returns false and, when perr is not NULL, stores the error in *perr.
+ */
+static bool generic_recv(struct tevent_req *req, int *perr)
+{
+	int unix_err;
+	bool failed;
+
+	failed = tevent_req_is_unix_error(req, &unix_err);
+	if (!failed) {
+		return true;
+	}
+
+	if (perr != NULL) {
+		*perr = unix_err;
+	}
+	return false;
+}
+
+/* Last srvid handed out; the counter starts at CTDB_SRVID_RECOVERY */
+static uint64_t rec_srvid = CTDB_SRVID_RECOVERY;
+
+/*
+ * Advance the srvid counter by one and return the new value
+ */
+static uint64_t srvid_next(void)
+{
+	return ++rec_srvid;
+}
+
+/*
+ * Node related functions
+ */
+
+struct node_list { /* fixed-capacity list of nodes with per-node capabilities and ban credits */
+ uint32_t *pnn_list; /* node PNNs; unused slots hold CTDB_UNKNOWN_PNN */
+ uint32_t *caps; /* capability bits, indexed in parallel with pnn_list */
+ uint32_t *ban_credits; /* ban credits, indexed in parallel with pnn_list */
+ unsigned int size; /* number of slots allocated in each array */
+ unsigned int count; /* number of slots in use */
+};
+
+/*
+ * Allocate an empty node list with room for size nodes.
+ *
+ * The caps and ban_credits arrays start zeroed and every pnn_list slot
+ * is set to CTDB_UNKNOWN_PNN.  Returns NULL if any allocation fails.
+ */
+static struct node_list *node_list_init(TALLOC_CTX *mem_ctx, unsigned int size)
+{
+	struct node_list *list = talloc_zero(mem_ctx, struct node_list);
+	unsigned int slot;
+
+	if (list == NULL) {
+		return NULL;
+	}
+
+	list->pnn_list = talloc_array(list, uint32_t, size);
+	list->caps = talloc_zero_array(list, uint32_t, size);
+	list->ban_credits = talloc_zero_array(list, uint32_t, size);
+
+	if (list->pnn_list != NULL &&
+	    list->caps != NULL &&
+	    list->ban_credits != NULL) {
+		list->size = size;
+		for (slot = 0; slot < list->size; slot++) {
+			list->pnn_list[slot] = CTDB_UNKNOWN_PNN;
+		}
+		return list;
+	}
+
+	/* the arrays are children of list, so this releases everything */
+	talloc_free(list);
+	return NULL;
+}
+
+/*
+ * Append pnn to the node list.
+ *
+ * Returns false if the list is already full or pnn is already present,
+ * true once pnn has been stored.
+ */
+static bool node_list_add(struct node_list *nlist, uint32_t pnn)
+{
+	unsigned int idx = 0;
+
+	if (nlist->count == nlist->size) {
+		return false;
+	}
+
+	while (idx < nlist->count) {
+		if (nlist->pnn_list[idx] == pnn) {
+			return false;
+		}
+		idx++;
+	}
+
+	nlist->pnn_list[nlist->count] = pnn;
+	nlist->count++;
+	return true;
+}
+
+/*
+ * Build an array of the PNNs of all listed nodes that have the
+ * CTDB_CAP_LMASTER capability.
+ *
+ * The array is allocated on mem_ctx with room for nlist->count entries
+ * and the number of entries filled in is stored in *pnn_count.
+ * Returns NULL, leaving *pnn_count untouched, if the allocation fails.
+ */
+static uint32_t *node_list_lmaster(struct node_list *nlist,
+				   TALLOC_CTX *mem_ctx,
+				   unsigned int *pnn_count)
+{
+	uint32_t *lmasters;
+	unsigned int found = 0;
+	unsigned int idx;
+
+	lmasters = talloc_zero_array(mem_ctx, uint32_t, nlist->count);
+	if (lmasters == NULL) {
+		return NULL;
+	}
+
+	for (idx = 0; idx < nlist->count; idx++) {
+		if (nlist->caps[idx] & CTDB_CAP_LMASTER) {
+			lmasters[found] = nlist->pnn_list[idx];
+			found++;
+		}
+	}
+
+	*pnn_count = found;
+	return lmasters;
+}
+
+/*
+ * Add one ban credit to the list entry for pnn; a pnn that is not in
+ * the list is ignored
+ */
+static void node_list_ban_credits(struct node_list *nlist, uint32_t pnn)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < nlist->count; idx++) {
+		if (nlist->pnn_list[idx] != pnn) {
+			continue;
+		}
+
+		nlist->ban_credits[idx]++;
+		return;
+	}
+}
+
+/*
+ * Database list functions
+ *
+ * Simple, naive implementation that could be updated to a db_hash or similar
+ */
+
+struct db { /* one database and the nodes it was reported on */
+ struct db *prev, *next; /* list linkage, maintained by DLIST_ADD_END() in db_list_add() */
+
+ uint32_t db_id;
+ uint32_t db_flags; /* db_list_check_and_add() masks out READONLY and STICKY before storing */
+ uint32_t *pnn_list; /* nodes that have this database; allocated with db_list.num_nodes slots */
+ unsigned int num_nodes; /* number of pnn_list slots in use */
+};
+
+struct db_list { /* list of databases */
+ unsigned int num_dbs; /* number of entries on the db list */
+ struct db *db; /* head of the list */
+ unsigned int num_nodes; /* size used for every entry's pnn_list */
+};
+
+/*
+ * Allocate an empty database list.
+ *
+ * num_nodes is the number of pnn_list slots given to every database
+ * later added with db_list_add().  Returns NULL if the allocation
+ * fails.
+ */
+static struct db_list *db_list_init(TALLOC_CTX *mem_ctx, unsigned int num_nodes)
+{
+	struct db_list *l;
+
+	l = talloc_zero(mem_ctx, struct db_list);
+	if (l == NULL) {
+		/* do not dereference a failed allocation */
+		return NULL;
+	}
+	l->num_nodes = num_nodes;
+
+	return l;
+}
+
+/*
+ * Look up a database by id.
+ *
+ * Returns the matching entry, or NULL if dblist is NULL or no entry
+ * has that id.
+ */
+static struct db *db_list_find(struct db_list *dblist, uint32_t db_id)
+{
+	struct db *cur;
+
+	if (dblist == NULL) {
+		return NULL;
+	}
+
+	for (cur = dblist->db; cur != NULL; cur = cur->next) {
+		if (cur->db_id == db_id) {
+			break;
+		}
+	}
+
+	return cur;
+}
+
+/*
+ * Append a new database entry to the list, recording node as the
+ * first node that has it.
+ *
+ * Returns 0 on success, EINVAL if dblist is NULL, ENOMEM if an
+ * allocation fails.  No check is made for an existing entry with the
+ * same id.
+ */
+static int db_list_add(struct db_list *dblist,
+		       uint32_t db_id,
+		       uint32_t db_flags,
+		       uint32_t node)
+{
+	struct db *entry;
+
+	if (dblist == NULL) {
+		return EINVAL;
+	}
+
+	entry = talloc_zero(dblist, struct db);
+	if (entry == NULL) {
+		return ENOMEM;
+	}
+
+	entry->pnn_list = talloc_zero_array(entry, uint32_t, dblist->num_nodes);
+	if (entry->pnn_list == NULL) {
+		talloc_free(entry);
+		return ENOMEM;
+	}
+
+	entry->db_id = db_id;
+	entry->db_flags = db_flags;
+	entry->pnn_list[0] = node;
+	entry->num_nodes = 1;
+
+	DLIST_ADD_END(dblist->db, entry);
+	dblist->num_dbs += 1;
+
+	return 0;
+}
+
+/*
+ * Record that node has database db_id.
+ *
+ * An unknown database is added to the list.  For a known database the
+ * flags must match those already recorded and there must still be
+ * room in its pnn_list.
+ *
+ * Returns 0 on success, EINVAL if dblist is NULL, the flags differ or
+ * the pnn_list is full, or the error from db_list_add().
+ */
+static int db_list_check_and_add(struct db_list *dblist,
+				 uint32_t db_id,
+				 uint32_t db_flags,
+				 uint32_t node)
+{
+	struct db *known;
+
+	if (dblist == NULL) {
+		return EINVAL;
+	}
+
+	/*
+	 * These flags are masked out because they are only set on a
+	 * node when a client attaches to that node, so they might not
+	 * be set yet.  They can't be passed as part of the attach, so
+	 * they're no use here.
+	 */
+	db_flags &= ~(CTDB_DB_FLAGS_READONLY | CTDB_DB_FLAGS_STICKY);
+
+	known = db_list_find(dblist, db_id);
+	if (known == NULL) {
+		return db_list_add(dblist, db_id, db_flags, node);
+	}
+
+	if (known->db_flags != db_flags) {
+		D_ERR("Incompatible database flags for 0x%"PRIx32" "
+		      "(0x%"PRIx32" != 0x%"PRIx32")\n",
+		      db_id,
+		      db_flags,
+		      known->db_flags);
+		return EINVAL;
+	}
+
+	if (known->num_nodes >= dblist->num_nodes) {
+		return EINVAL;
+	}
+
+	known->pnn_list[known->num_nodes] = node;
+	known->num_nodes += 1;
+
+	return 0;
+}
+
+/*
+ * Create database on nodes where it is missing
+ */
+
+struct db_create_missing_state { /* request state for db_create_missing_send() */
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+
+ struct node_list *nlist; /* receives ban credits when an attach fails on a node */
+
+ const char *db_name; /* caller's pointer is stored, not copied */
+ uint32_t *missing_pnn_list; /* nodes in nlist that are not in the database's pnn_list */
+ int missing_num_nodes; /* number of entries filled in missing_pnn_list */
+};
+
+static void db_create_missing_done(struct tevent_req *subreq);
+
+/*
+ * Start attaching database db_name on every node in nlist that is not
+ * yet recorded in db->pnn_list.
+ *
+ * If db is already recorded on as many nodes as nlist holds, the
+ * request completes immediately.  Otherwise the attach control matching
+ * the database flags (persistent, replicated or plain) is sent to the
+ * missing nodes with CTDB_CTRL_FLAG_ATTACH_RECOVERY set.
+ *
+ * db_name is stored by pointer, so it has to stay valid until the
+ * request completes.  Returns NULL if the request cannot be created.
+ */
+static struct tevent_req *db_create_missing_send(
+					TALLOC_CTX *mem_ctx,
+					struct tevent_context *ev,
+					struct ctdb_client_context *client,
+					struct node_list *nlist,
+					const char *db_name,
+					struct db *db)
+{
+	struct tevent_req *req, *subreq;
+	struct db_create_missing_state *state;
+	struct ctdb_req_control request;
+	unsigned int i, j;
+
+	req = tevent_req_create(mem_ctx,
+				&state,
+				struct db_create_missing_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->nlist = nlist;
+	state->db_name = db_name;
+
+	if (nlist->count == db->num_nodes) {
+		tevent_req_done(req);
+		return tevent_req_post(req, ev);
+	}
+
+	/*
+	 * Allocate on the request state rather than on the caller's
+	 * context so that the list is released together with the
+	 * request
+	 */
+	state->missing_pnn_list = talloc_array(state, uint32_t, nlist->count);
+	if (tevent_req_nomem(state->missing_pnn_list, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	for (i = 0; i < nlist->count; i++) {
+		uint32_t pnn = nlist->pnn_list[i] ;
+
+		/* is this node already known to have the database? */
+		for (j = 0; j < db->num_nodes; j++) {
+			if (pnn == db->pnn_list[j]) {
+				break;
+			}
+		}
+
+		if (j < db->num_nodes) {
+			continue;
+		}
+
+		DBG_INFO("Create database %s on node %u\n",
+			 state->db_name,
+			 pnn);
+		state->missing_pnn_list[state->missing_num_nodes] = pnn;
+		state->missing_num_nodes++;
+	}
+
+	if (db->db_flags & CTDB_DB_FLAGS_PERSISTENT) {
+		ctdb_req_control_db_attach_persistent(&request, db_name);
+	} else if (db->db_flags & CTDB_DB_FLAGS_REPLICATED) {
+		ctdb_req_control_db_attach_replicated(&request, db_name);
+	} else {
+		ctdb_req_control_db_attach(&request, db_name);
+	}
+	request.flags = CTDB_CTRL_FLAG_ATTACH_RECOVERY;
+	subreq = ctdb_client_control_multi_send(state,
+						state->ev,
+						state->client,
+						state->missing_pnn_list,
+						state->missing_num_nodes,
+						TIMEOUT(),
+						&request);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, db_create_missing_done, req);
+
+	return req;
+}
+
+/*
+ * Completion callback for the multi-node attach control.
+ *
+ * On failure the error is logged.  If a failing node can be identified
+ * it is given a ban credit.  The request then fails with the error
+ * reported for the multi-node control as a whole.
+ */
+static void db_create_missing_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct db_create_missing_state *state = tevent_req_data(
+		req, struct db_create_missing_state);
+	int *err_list;
+	uint32_t failed_pnn;
+	int node_ret;
+	int ret;
+	bool ok;
+
+	ok = ctdb_client_control_multi_recv(subreq,
+					    &ret,
+					    NULL,
+					    &err_list,
+					    NULL);
+	TALLOC_FREE(subreq);
+	if (ok) {
+		tevent_req_done(req);
+		return;
+	}
+
+	node_ret = ctdb_client_control_multi_error(state->missing_pnn_list,
+						   state->missing_num_nodes,
+						   err_list,
+						   &failed_pnn);
+	if (node_ret == 0) {
+		D_ERR("control DB_ATTACH failed for db %s, ret=%d\n",
+		      state->db_name,
+		      ret);
+	} else {
+		D_ERR("control DB_ATTACH failed for db %s"
+		      " on node %u, ret=%d\n",
+		      state->db_name,
+		      failed_pnn,
+		      node_ret);
+		node_list_ban_credits(state->nlist, failed_pnn);
+	}
+
+	tevent_req_error(req, ret);
+}
+
+/*
+ * Collect the result of db_create_missing_send(); same contract as
+ * generic_recv()
+ */
+static bool db_create_missing_recv(struct tevent_req *req, int *perr)
+{
+	bool ok = generic_recv(req, perr);
+
+	return ok;
+}
+
+/*
+ * Recovery database functions
+ */
+
+struct recdb_context { /* handle for a temporary recovery database */
+ uint32_t db_id;
+ const char *db_name; /* caller's pointer is stored, not copied */
+ const char *db_path; /* path of the recdb.<db_name> file, allocated on this context */
+ struct tdb_wrap *db; /* opened tdb backing the recovery database */
+ bool persistent;
+};
+
+/*
+ * Create a fresh recovery database file for database db_name.
+ *
+ * The file is named recdb.<db_name> and is placed in the directory
+ * given by the CTDB_DBDIR_STATE environment variable or, when that is
+ * not set, in the directory part of db_path.  Any existing file of
+ * that name is unlinked first and the new one is created exclusively.
+ *
+ * NOTE(review): dirname() is applied to db_path with const cast away,
+ * and POSIX permits dirname() to modify its argument - confirm that
+ * callers tolerate this.
+ *
+ * Returns NULL on allocation failure or if the tdb cannot be created.
+ */
+static struct recdb_context *recdb_create(TALLOC_CTX *mem_ctx, uint32_t db_id,
+					  const char *db_name,
+					  const char *db_path,
+					  uint32_t hash_size, bool persistent)
+{
+	static char *db_dir_state = NULL;
+	struct recdb_context *recdb;
+	unsigned int tdb_flags;
+
+	recdb = talloc(mem_ctx, struct recdb_context);
+	if (recdb == NULL) {
+		return NULL;
+	}
+
+	/* cached across calls; looked up again only while it is unset */
+	if (db_dir_state == NULL) {
+		db_dir_state = getenv("CTDB_DBDIR_STATE");
+	}
+
+	recdb->db_name = db_name;
+	recdb->db_id = db_id;
+	recdb->db_path = talloc_asprintf(recdb, "%s/recdb.%s",
+					 db_dir_state != NULL ?
+					 db_dir_state :
+					 dirname(discard_const(db_path)),
+					 db_name);
+	if (recdb->db_path == NULL) {
+		talloc_free(recdb);
+		return NULL;
+	}
+	unlink(recdb->db_path);
+
+	tdb_flags = TDB_NOLOCK | TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING;
+	recdb->db = tdb_wrap_open(mem_ctx, recdb->db_path, hash_size,
+				  tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
+	if (recdb->db == NULL) {
+		/*
+		 * Log before freeing: db_path is allocated on recdb and
+		 * is gone once recdb has been freed
+		 */
+		D_ERR("failed to create recovery db %s\n", recdb->db_path);
+		talloc_free(recdb);
+		return NULL;
+	}
+
+	recdb->persistent = persistent;
+
+	return recdb;
+}
+
+static uint32_t recdb_id(struct recdb_context *recdb) /* accessor: database id */
+{
+ return recdb->db_id;
+}
+
+static const char *recdb_name(struct recdb_context *recdb) /* accessor: database name */
+{
+ return recdb->db_name;
+}
+
+static const char *recdb_path(struct recdb_context *recdb) /* accessor: path of the recovery database file */
+{
+ return recdb->db_path;
+}
+
+static struct tdb_context *recdb_tdb(struct recdb_context *recdb) /* accessor: underlying tdb handle */
+{
+ return recdb->db->tdb;
+}
+
+static bool recdb_persistent(struct recdb_context *recdb) /* accessor: persistent flag given to recdb_create() */
+{
+ return recdb->persistent;
+}
+
+struct recdb_add_traverse_state { /* private data handed to recdb_add_traverse() */
+ struct recdb_context *recdb; /* recovery database the records are merged into */
+ uint32_t mypnn; /* compared with the stored record's dmaster when RSNs are equal */
+};
+
+/*
+ * Record-buffer traverse callback: merge one record into the recovery
+ * database.
+ *
+ * The stored record is kept when its RSN is higher than the incoming
+ * one, or when the RSNs are equal and the stored record's dmaster is
+ * not this node.  In every other case the incoming record replaces it.
+ *
+ * Returns 0 on success and -1 if the record is too short to hold a
+ * header or the store fails.
+ */
+static int recdb_add_traverse(uint32_t reqid, struct ctdb_ltdb_header *header,
+			      TDB_DATA key, TDB_DATA data,
+			      void *private_data)
+{
+	struct recdb_add_traverse_state *state = private_data;
+	struct ctdb_ltdb_header *new_hdr;
+	TDB_DATA old_data;
+
+	/* header is not marshalled separately in the pulldb control */
+	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
+		return -1;
+	}
+	new_hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+	/* fetch the existing record, if any */
+	old_data = tdb_fetch(recdb_tdb(state->recdb), key);
+	if (old_data.dptr != NULL) {
+		struct ctdb_ltdb_header old_hdr;
+		bool keep_old;
+
+		/* copy the header out so the fetched buffer can be freed now */
+		old_hdr = *(struct ctdb_ltdb_header *)old_data.dptr;
+		free(old_data.dptr);
+
+		keep_old = (new_hdr->rsn < old_hdr.rsn) ||
+			   (new_hdr->rsn == old_hdr.rsn &&
+			    old_hdr.dmaster != state->mypnn);
+		if (keep_old) {
+			return 0;
+		}
+	}
+
+	if (tdb_store(recdb_tdb(state->recdb), key, data, TDB_REPLACE) != 0) {
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Merge every record in recbuf into the recovery database.
+ *
+ * Returns true if the whole buffer was traversed without error.
+ */
+static bool recdb_add(struct recdb_context *recdb, int mypnn,
+		      struct ctdb_rec_buffer *recbuf)
+{
+	struct recdb_add_traverse_state state = {
+		.recdb = recdb,
+		.mypnn = mypnn,
+	};
+	int ret;
+
+	ret = ctdb_rec_buffer_traverse(recbuf, recdb_add_traverse, &state);
+
+	return (ret == 0);
+}
+
+/*
+ * Decide whether a record from recdb is retained and, if so, append it
+ * to recbuf.
+ *
+ * Records with no data beyond the ltdb header are skipped.  For
+ * non-persistent databases the header in the record is rewritten in
+ * place: dmaster is set to the given value and
+ * CTDB_REC_FLAG_MIGRATED_WITH_DATA is set.
+ *
+ * Returns 0 on success (including a skipped record), or the error from
+ * ctdb_rec_buffer_add().
+ */
+static int recbuf_filter_add(struct ctdb_rec_buffer *recbuf, bool persistent,
+			     uint32_t reqid, uint32_t dmaster,
+			     TDB_DATA key, TDB_DATA data)
+{
+	/* Skip empty records */
+	if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
+		return 0;
+	}
+
+	if (!persistent) {
+		struct ctdb_ltdb_header *hdr =
+			(struct ctdb_ltdb_header *)data.dptr;
+
+		/* update the dmaster field to point to us */
+		hdr->dmaster = dmaster;
+		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+	}
+
+	return ctdb_rec_buffer_add(recbuf, recbuf, reqid, NULL, key, data);
+}
+
+struct recdb_file_traverse_state { /* private data handed to recdb_file_traverse() */
+ struct ctdb_rec_buffer *recbuf; /* buffer being filled; replaced after each write */
+ struct recdb_context *recdb;
+ TALLOC_CTX *mem_ctx; /* parent for replacement buffers */
+ uint32_t dmaster; /* passed on to recbuf_filter_add() */
+ uint32_t reqid; /* passed on to recbuf_filter_add() */
+ bool persistent;
+ bool failed; /* set by the traverse callback on any error */
+ int fd; /* file descriptor the buffers are written to */
+ size_t max_size; /* buffer is written out once its length exceeds this */
+ unsigned int num_buffers; /* number of buffers written so far */
+};
+
+/*
+ * tdb traverse callback: add one record to the current buffer and,
+ * once the buffer has grown beyond max_size, write it to the file and
+ * start a new one.
+ *
+ * Returns 0 to continue the traverse.  On any error state->failed is
+ * set and a non-zero value is returned.
+ */
+static int recdb_file_traverse(struct tdb_context *tdb,
+			       TDB_DATA key, TDB_DATA data,
+			       void *private_data)
+{
+	struct recdb_file_traverse_state *state = private_data;
+	int ret;
+
+	ret = recbuf_filter_add(state->recbuf, state->persistent,
+				state->reqid, state->dmaster, key, data);
+	if (ret != 0) {
+		state->failed = true;
+		return ret;
+	}
+
+	/* keep filling the buffer until it exceeds the limit */
+	if (ctdb_rec_buffer_len(state->recbuf) <= state->max_size) {
+		return 0;
+	}
+
+	ret = ctdb_rec_buffer_write(state->recbuf, state->fd);
+	if (ret != 0) {
+		D_ERR("Failed to collect recovery records for %s\n",
+		      recdb_name(state->recdb));
+		state->failed = true;
+		return ret;
+	}
+	state->num_buffers += 1;
+
+	/* start over with an empty buffer */
+	TALLOC_FREE(state->recbuf);
+	state->recbuf = ctdb_rec_buffer_init(state->mem_ctx,
+					     recdb_id(state->recdb));
+	if (state->recbuf == NULL) {
+		state->failed = true;
+		return ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the retained records of recdb to fd as a sequence of record
+ * buffers.
+ *
+ * A buffer is flushed whenever it grows beyond max_size; the final
+ * (possibly short) buffer is always written.  Record buffers are
+ * allocated on mem_ctx and released before returning, on success as
+ * well as on failure.
+ *
+ * Returns the number of buffers written, or -1 on failure.
+ */
+static int recdb_file(struct recdb_context *recdb, TALLOC_CTX *mem_ctx,
+		      uint32_t dmaster, int fd, int max_size)
+{
+	struct recdb_file_traverse_state state;
+	int ret;
+
+	state.recbuf = ctdb_rec_buffer_init(mem_ctx, recdb_id(recdb));
+	if (state.recbuf == NULL) {
+		return -1;
+	}
+	state.recdb = recdb;
+	state.mem_ctx = mem_ctx;
+	state.dmaster = dmaster;
+	state.reqid = 0;
+	state.persistent = recdb_persistent(recdb);
+	state.failed = false;
+	state.fd = fd;
+	state.max_size = max_size;
+	state.num_buffers = 0;
+
+	ret = tdb_traverse_read(recdb_tdb(recdb), recdb_file_traverse, &state);
+	if (ret == -1 || state.failed) {
+		TALLOC_FREE(state.recbuf);
+		return -1;
+	}
+
+	/* Flush whatever is left in the last buffer */
+	ret = ctdb_rec_buffer_write(state.recbuf, fd);
+	if (ret != 0) {
+		D_ERR("Failed to collect recovery records for %s\n",
+		      recdb_name(recdb));
+		TALLOC_FREE(state.recbuf);
+		return -1;
+	}
+	state.num_buffers += 1;
+
+	/*
+	 * The buffer has been written out; do not leave it hanging off
+	 * mem_ctx for the lifetime of the caller.
+	 */
+	TALLOC_FREE(state.recbuf);
+
+	D_DEBUG("Wrote %d buffers of recovery records for %s\n",
+		state.num_buffers, recdb_name(recdb));
+
+	return state.num_buffers;
+}
+
+/*
+ * Pull database from a single node
+ */
+
+/* Request state for pull_database_send() and its callbacks */
+struct pull_database_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ /* Local database the pulled records are added to */
+ struct recdb_context *recdb;
+ /* Node the DB_PULL control is sent to */
+ uint32_t pnn;
+ /* srvid from srvid_next(); the message handler is registered on it */
+ uint64_t srvid;
+ /* Running total of records received by pull_database_handler() */
+ unsigned int num_records;
+ /* Error saved by pull_database_done(), reported after unregistering */
+ int result;
+};
+
+static void pull_database_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data);
+static void pull_database_register_done(struct tevent_req *subreq);
+static void pull_database_unregister_done(struct tevent_req *subreq);
+static void pull_database_done(struct tevent_req *subreq);
+
+/*
+ * Start pulling the records of recdb's database from node pnn.
+ *
+ * The first step registers pull_database_handler() on a fresh srvid;
+ * the remaining steps are driven from pull_database_register_done().
+ */
+static struct tevent_req *pull_database_send(
+			TALLOC_CTX *mem_ctx,
+			struct tevent_context *ev,
+			struct ctdb_client_context *client,
+			uint32_t pnn,
+			struct recdb_context *recdb)
+{
+	struct pull_database_state *state;
+	struct tevent_req *req;
+	struct tevent_req *subreq;
+
+	req = tevent_req_create(mem_ctx, &state, struct pull_database_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->recdb = recdb;
+	state->pnn = pnn;
+	state->srvid = srvid_next();
+
+	/* The request itself is the handler's private data */
+	subreq = ctdb_client_set_message_handler_send(state,
+						      ev,
+						      client,
+						      state->srvid,
+						      pull_database_handler,
+						      req);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, pull_database_register_done, req);
+
+	return req;
+}
+
+/*
+ * Message handler for DB_PULL record buffers.
+ *
+ * Messages for a different srvid are ignored.  A valid buffer for the
+ * expected database is added to the local recdb and its record count
+ * is accumulated in state->num_records.  Malformed or mismatching
+ * buffers are logged and dropped.
+ */
+static void pull_database_handler(uint64_t srvid, TDB_DATA data,
+				  void *private_data)
+{
+	struct tevent_req *req = talloc_get_type_abort(
+		private_data, struct tevent_req);
+	struct pull_database_state *state = tevent_req_data(
+		req, struct pull_database_state);
+	struct ctdb_rec_buffer *recbuf;
+	size_t np;
+	int ret;
+	bool status;
+
+	if (srvid != state->srvid) {
+		return;
+	}
+
+	ret = ctdb_rec_buffer_pull(data.dptr, data.dsize, state, &recbuf, &np);
+	if (ret != 0) {
+		D_ERR("Invalid data received for DB_PULL messages\n");
+		return;
+	}
+
+	if (recbuf->db_id != recdb_id(state->recdb)) {
+		/* Log before freeing: the message reads recbuf->db_id */
+		D_ERR("Invalid dbid:%08x for DB_PULL messages for %s\n",
+		      recbuf->db_id, recdb_name(state->recdb));
+		talloc_free(recbuf);
+		return;
+	}
+
+	status = recdb_add(state->recdb, ctdb_client_pnn(state->client),
+			   recbuf);
+	if (! status) {
+		talloc_free(recbuf);
+		D_ERR("Failed to add records to recdb for %s\n",
+		      recdb_name(state->recdb));
+		return;
+	}
+
+	state->num_records += recbuf->count;
+	talloc_free(recbuf);
+}
+
+/*
+ * The message handler is registered: send the DB_PULL control to the
+ * source node, asking for the records to be delivered on our srvid.
+ */
+static void pull_database_register_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct pull_database_state *state = tevent_req_data(
+		req, struct pull_database_state);
+	struct ctdb_pulldb_ext pull_args;
+	struct ctdb_req_control ctl;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_set_message_handler_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		D_ERR("Failed to set message handler for DB_PULL for %s\n",
+		      recdb_name(state->recdb));
+		tevent_req_error(req, err);
+		return;
+	}
+
+	pull_args.db_id = recdb_id(state->recdb);
+	pull_args.lmaster = CTDB_LMASTER_ANY;
+	pull_args.srvid = state->srvid;
+	ctdb_req_control_db_pull(&ctl, &pull_args);
+
+	subreq = ctdb_client_control_send(state,
+					  state->ev,
+					  state->client,
+					  state->pnn,
+					  TIMEOUT(),
+					  &ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, pull_database_done, req);
+}
+
+/*
+ * The DB_PULL control has completed.
+ *
+ * Checks that the number of records reported by the remote node matches
+ * the number received by the message handler.  Whatever the outcome,
+ * the message handler is removed next; a failure is remembered in
+ * state->result and reported by pull_database_unregister_done().
+ */
+static void pull_database_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct pull_database_state *state = tevent_req_data(
+		req, struct pull_database_state);
+	struct ctdb_reply_control *reply;
+	uint32_t num_records;
+	int ret;
+	bool status;
+
+	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		D_ERR("control DB_PULL failed for %s on node %u, ret=%d\n",
+		      recdb_name(state->recdb), state->pnn, ret);
+		state->result = ret;
+		goto unregister;
+	}
+
+	ret = ctdb_reply_control_db_pull(reply, &num_records);
+	talloc_free(reply);
+	if (ret != 0) {
+		/* num_records is not valid if the reply could not be parsed */
+		D_ERR("control DB_PULL failed for %s on node %u, ret=%d\n",
+		      recdb_name(state->recdb), state->pnn, ret);
+		state->result = EPROTO;
+		goto unregister;
+	}
+
+	if (num_records != state->num_records) {
+		D_ERR("mismatch (%u != %u) in DB_PULL records for db %s\n",
+		      num_records, state->num_records,
+		      recdb_name(state->recdb));
+		state->result = EIO;
+		goto unregister;
+	}
+
+	D_INFO("Pulled %d records for db %s from node %d\n",
+	       state->num_records, recdb_name(state->recdb), state->pnn);
+
+unregister:
+
+	subreq = ctdb_client_remove_message_handler_send(
+					state, state->ev, state->client,
+					state->srvid, req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, pull_database_unregister_done, req);
+}
+
+/*
+ * The message handler has been removed: finish the pull request,
+ * reporting any error saved earlier in state->result.
+ */
+static void pull_database_unregister_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct pull_database_state *state = tevent_req_data(
+		req, struct pull_database_state);
+	bool ok;
+	int err;
+
+	ok = ctdb_client_remove_message_handler_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		D_ERR("failed to remove message handler for DB_PULL for db %s\n",
+		      recdb_name(state->recdb));
+		tevent_req_error(req, err);
+		return;
+	}
+
+	if (state->result == 0) {
+		tevent_req_done(req);
+		return;
+	}
+
+	tevent_req_error(req, state->result);
+}
+
+/* Collect the result of a pull_database_send() request via generic_recv() */
+static bool pull_database_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/*
+ * Push database to specified nodes (new style)
+ */
+
+/* Request state for push_database_send() and its callbacks */
+struct push_database_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ /* Local database whose records are pushed */
+ struct recdb_context *recdb;
+ /* Destination nodes and their number */
+ uint32_t *pnn_list;
+ unsigned int count;
+ /* srvid from srvid_next(); record buffers are sent as messages on it */
+ uint64_t srvid;
+ /* PNN of this client, passed to recdb_file() as dmaster */
+ uint32_t dmaster;
+ /* File holding the record buffers written by recdb_file() */
+ int fd;
+ /* Number of buffers in the file and number sent so far */
+ int num_buffers;
+ int num_buffers_sent;
+ /* Running total of records contained in the buffers sent */
+ unsigned int num_records;
+};
+
+static void push_database_started(struct tevent_req *subreq);
+static void push_database_send_msg(struct tevent_req *req);
+static void push_database_send_done(struct tevent_req *subreq);
+static void push_database_confirmed(struct tevent_req *subreq);
+
+/* Close the record file when the push request state is freed */
+static int push_database_state_destructor(struct push_database_state *state)
+{
+	if (state->fd != -1) {
+		close(state->fd);
+		state->fd = -1;
+	}
+	return 0;
+}
+
+/*
+ * Start pushing the records of recdb to the nodes in pnn_list.
+ *
+ * The records are first written to an already-unlinked "<path>.dat"
+ * file as buffers of roughly max_size bytes.  The DB_PUSH_START control
+ * is then sent to all nodes; the buffers are read back and sent one at
+ * a time by push_database_send_msg().
+ *
+ * The file descriptor is owned by the request state and is closed when
+ * that state is freed.
+ */
+static struct tevent_req *push_database_send(
+			TALLOC_CTX *mem_ctx,
+			struct tevent_context *ev,
+			struct ctdb_client_context *client,
+			uint32_t *pnn_list,
+			unsigned int count,
+			struct recdb_context *recdb,
+			int max_size)
+{
+	struct tevent_req *req, *subreq;
+	struct push_database_state *state;
+	struct ctdb_req_control request;
+	struct ctdb_pulldb_ext pulldb_ext;
+	char *filename;
+	off_t offset;
+
+	req = tevent_req_create(mem_ctx, &state,
+				struct push_database_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->recdb = recdb;
+	state->pnn_list = pnn_list;
+	state->count = count;
+
+	state->srvid = srvid_next();
+	state->dmaster = ctdb_client_pnn(client);
+	state->fd = -1;
+	state->num_buffers_sent = 0;
+	state->num_records = 0;
+
+	filename = talloc_asprintf(state, "%s.dat", recdb_path(recdb));
+	if (tevent_req_nomem(filename, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	state->fd = open(filename, O_RDWR|O_CREAT, 0644);
+	if (state->fd == -1) {
+		tevent_req_error(req, errno);
+		return tevent_req_post(req, ev);
+	}
+	/* From here on the descriptor is released together with state */
+	talloc_set_destructor(state, push_database_state_destructor);
+
+	/* Only the open descriptor is needed; drop the directory entry */
+	unlink(filename);
+	talloc_free(filename);
+
+	state->num_buffers = recdb_file(recdb, state, state->dmaster,
+					state->fd, max_size);
+	if (state->num_buffers == -1) {
+		tevent_req_error(req, ENOMEM);
+		return tevent_req_post(req, ev);
+	}
+
+	/* Rewind so the buffers can be read back from the start */
+	offset = lseek(state->fd, 0, SEEK_SET);
+	if (offset != 0) {
+		tevent_req_error(req, EIO);
+		return tevent_req_post(req, ev);
+	}
+
+	/*
+	 * Initialise the whole structure so that no indeterminate stack
+	 * contents (the lmaster field) are handed to the control.
+	 */
+	pulldb_ext = (struct ctdb_pulldb_ext) {
+		.db_id = recdb_id(recdb),
+		.srvid = state->srvid,
+	};
+
+	ctdb_req_control_db_push_start(&request, &pulldb_ext);
+	subreq = ctdb_client_control_multi_send(state, ev, client,
+						pnn_list, count,
+						TIMEOUT(), &request);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, push_database_started, req);
+
+	return req;
+}
+
+/*
+ * DB_PUSH_START has completed on all nodes: on success begin sending
+ * the record buffers, otherwise log the failure and fail the request.
+ */
+static void push_database_started(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct push_database_state *state = tevent_req_data(
+		req, struct push_database_state);
+	int *err_list;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, state,
+					    &err_list, NULL);
+	TALLOC_FREE(subreq);
+	if (ok) {
+		push_database_send_msg(req);
+		return;
+	}
+
+	{
+		uint32_t failed_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(state->pnn_list,
+							   state->count,
+							   err_list,
+							   &failed_pnn);
+		if (node_err == 0) {
+			D_ERR("control DB_PUSH_START failed for db %s,"
+			      " ret=%d\n",
+			      recdb_name(state->recdb), err);
+		} else {
+			D_ERR("control DB_PUSH_START failed for db %s"
+			      " on node %u, ret=%d\n",
+			      recdb_name(state->recdb), failed_pnn, node_err);
+		}
+		talloc_free(err_list);
+	}
+
+	tevent_req_error(req, err);
+}
+
+/*
+ * Send the next record buffer to all nodes, or, once every buffer has
+ * been sent, send the DB_PUSH_CONFIRM control instead.
+ */
+static void push_database_send_msg(struct tevent_req *req)
+{
+	struct push_database_state *state = tevent_req_data(
+		req, struct push_database_state);
+	struct ctdb_req_message msg;
+	struct ctdb_rec_buffer *recbuf;
+	struct tevent_req *subreq;
+	TDB_DATA blob;
+	size_t np;
+	int rc;
+
+	if (state->num_buffers_sent == state->num_buffers) {
+		/* Nothing left to send: ask the nodes what they received */
+		struct ctdb_req_control confirm;
+
+		ctdb_req_control_db_push_confirm(&confirm,
+						 recdb_id(state->recdb));
+		subreq = ctdb_client_control_multi_send(state,
+							state->ev,
+							state->client,
+							state->pnn_list,
+							state->count,
+							TIMEOUT(),
+							&confirm);
+		if (tevent_req_nomem(subreq, req)) {
+			return;
+		}
+		tevent_req_set_callback(subreq, push_database_confirmed, req);
+		return;
+	}
+
+	rc = ctdb_rec_buffer_read(state->fd, state, &recbuf);
+	if (rc != 0) {
+		tevent_req_error(req, rc);
+		return;
+	}
+
+	/* Marshall the buffer into a flat blob for the message payload */
+	blob.dsize = ctdb_rec_buffer_len(recbuf);
+	blob.dptr = talloc_size(state, blob.dsize);
+	if (tevent_req_nomem(blob.dptr, req)) {
+		return;
+	}
+	ctdb_rec_buffer_push(recbuf, blob.dptr, &np);
+
+	msg.srvid = state->srvid;
+	msg.data.data = blob;
+
+	D_DEBUG("Pushing buffer %d with %d records for db %s\n",
+		state->num_buffers_sent, recbuf->count,
+		recdb_name(state->recdb));
+
+	subreq = ctdb_client_message_multi_send(state,
+						state->ev,
+						state->client,
+						state->pnn_list,
+						state->count,
+						&msg);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, push_database_send_done, req);
+
+	state->num_records += recbuf->count;
+
+	talloc_free(blob.dptr);
+	talloc_free(recbuf);
+}
+
+/*
+ * One record buffer has been delivered: count it and move on to the
+ * next one (or to the confirmation step).
+ */
+static void push_database_send_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct push_database_state *state = tevent_req_data(
+		req, struct push_database_state);
+	int err;
+	bool ok;
+
+	ok = ctdb_client_message_multi_recv(subreq, &err, NULL, NULL);
+	TALLOC_FREE(subreq);
+	if (ok) {
+		state->num_buffers_sent += 1;
+		push_database_send_msg(req);
+		return;
+	}
+
+	D_ERR("Sending recovery records failed for %s\n",
+	      recdb_name(state->recdb));
+	tevent_req_error(req, err);
+}
+
+/*
+ * DB_PUSH_CONFIRM has completed: every node must report having
+ * received exactly as many records as were sent, otherwise the request
+ * fails with EPROTO.
+ */
+static void push_database_confirmed(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct push_database_state *state = tevent_req_data(
+		req, struct push_database_state);
+	struct ctdb_reply_control **reply;
+	uint32_t received;
+	unsigned int idx;
+	int *err_list;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, state,
+					    &err_list, &reply);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		uint32_t failed_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(state->pnn_list,
+							   state->count,
+							   err_list,
+							   &failed_pnn);
+		if (node_err == 0) {
+			D_ERR("control DB_PUSH_CONFIRM failed for db %s,"
+			      " ret=%d\n",
+			      recdb_name(state->recdb), err);
+		} else {
+			D_ERR("control DB_PUSH_CONFIRM failed for db %s"
+			      " on node %u, ret=%d\n",
+			      recdb_name(state->recdb), failed_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	for (idx = 0; idx < state->count; idx++) {
+		err = ctdb_reply_control_db_push_confirm(reply[idx],
+							 &received);
+		if (err != 0) {
+			tevent_req_error(req, EPROTO);
+			return;
+		}
+
+		if (received == state->num_records) {
+			continue;
+		}
+
+		D_ERR("Node %u received %d of %d records for %s\n",
+		      state->pnn_list[idx], received,
+		      state->num_records, recdb_name(state->recdb));
+		tevent_req_error(req, EPROTO);
+		return;
+	}
+
+	talloc_free(reply);
+
+	D_INFO("Pushed %d records for db %s\n",
+	       state->num_records, recdb_name(state->recdb));
+
+	tevent_req_done(req);
+}
+
+/* Collect the result of a push_database_send() request via generic_recv() */
+static bool push_database_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/*
+ * Collect databases using highest sequence number
+ */
+
+/* Request state for collect_highseqnum_db_send() and its callbacks */
+struct collect_highseqnum_db_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ /* Nodes queried for their sequence number */
+ struct node_list *nlist;
+ uint32_t db_id;
+ /* Local database the pulled records are added to */
+ struct recdb_context *recdb;
+
+ /* Node chosen to pull from: the one reporting the highest seqnum */
+ uint32_t max_pnn;
+};
+
+static void collect_highseqnum_db_seqnum_done(struct tevent_req *subreq);
+static void collect_highseqnum_db_pulldb_done(struct tevent_req *subreq);
+
+/*
+ * Start collecting a database by pulling it from the single node that
+ * reports the highest sequence number.
+ *
+ * The first step sends GET_DB_SEQNUM to every node in nlist; the node
+ * selection and the pull happen in collect_highseqnum_db_seqnum_done().
+ */
+static struct tevent_req *collect_highseqnum_db_send(
+			TALLOC_CTX *mem_ctx,
+			struct tevent_context *ev,
+			struct ctdb_client_context *client,
+			struct node_list *nlist,
+			uint32_t db_id,
+			struct recdb_context *recdb)
+{
+	struct tevent_req *req, *subreq;
+	struct collect_highseqnum_db_state *state;
+	struct ctdb_req_control request;
+
+	req = tevent_req_create(mem_ctx, &state,
+				struct collect_highseqnum_db_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->nlist = nlist;
+	state->db_id = db_id;
+	state->recdb = recdb;
+
+	ctdb_req_control_get_db_seqnum(&request, db_id);
+	/*
+	 * Parent the subrequest to state (not mem_ctx) so that it is
+	 * torn down together with this request and its callback can
+	 * never run against a freed req.
+	 */
+	subreq = ctdb_client_control_multi_send(state,
+						ev,
+						client,
+						nlist->pnn_list,
+						nlist->count,
+						TIMEOUT(),
+						&request);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, collect_highseqnum_db_seqnum_done,
+				req);
+
+	return req;
+}
+
+/*
+ * All GET_DB_SEQNUM replies are in: pick the node with the highest
+ * sequence number (the first node in the list if none is above zero)
+ * and pull the database from it.
+ */
+static void collect_highseqnum_db_seqnum_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct collect_highseqnum_db_state *state = tevent_req_data(
+		req, struct collect_highseqnum_db_state);
+	struct ctdb_reply_control **reply;
+	uint64_t best_seqnum = 0;
+	uint64_t seqnum;
+	unsigned int idx;
+	int *err_list;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, state,
+					    &err_list, &reply);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		uint32_t failed_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+						state->nlist->pnn_list,
+						state->nlist->count,
+						err_list,
+						&failed_pnn);
+		if (node_err == 0) {
+			D_ERR("control GET_DB_SEQNUM failed for db %s,"
+			      " ret=%d\n",
+			      recdb_name(state->recdb), err);
+		} else {
+			D_ERR("control GET_DB_SEQNUM failed for db %s"
+			      " on node %u, ret=%d\n",
+			      recdb_name(state->recdb), failed_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	state->max_pnn = state->nlist->pnn_list[0];
+	for (idx = 0; idx < state->nlist->count; idx++) {
+		err = ctdb_reply_control_get_db_seqnum(reply[idx], &seqnum);
+		if (err != 0) {
+			tevent_req_error(req, EPROTO);
+			return;
+		}
+
+		/* Strictly greater: ties keep the earlier node */
+		if (seqnum > best_seqnum) {
+			best_seqnum = seqnum;
+			state->max_pnn = state->nlist->pnn_list[idx];
+		}
+	}
+
+	talloc_free(reply);
+
+	D_INFO("Pull persistent db %s from node %d with seqnum 0x%"PRIx64"\n",
+	       recdb_name(state->recdb), state->max_pnn, best_seqnum);
+
+	subreq = pull_database_send(state,
+				    state->ev,
+				    state->client,
+				    state->max_pnn,
+				    state->recdb);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, collect_highseqnum_db_pulldb_done,
+				req);
+}
+
+/*
+ * The pull from the selected node has finished.  On failure that node
+ * is given ban credits and the request fails.
+ */
+static void collect_highseqnum_db_pulldb_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct collect_highseqnum_db_state *state = tevent_req_data(
+		req, struct collect_highseqnum_db_state);
+	bool ok;
+	int err;
+
+	ok = pull_database_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (ok) {
+		tevent_req_done(req);
+		return;
+	}
+
+	node_list_ban_credits(state->nlist, state->max_pnn);
+	tevent_req_error(req, err);
+}
+
+/* Collect the result of collect_highseqnum_db_send() via generic_recv() */
+static bool collect_highseqnum_db_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/*
+ * Collect all databases
+ */
+
+/* Request state for collect_all_db_send() and its callback */
+struct collect_all_db_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ /* Nodes to pull from, one after another */
+ struct node_list *nlist;
+ uint32_t db_id;
+ /* Local database the pulled records are added to */
+ struct recdb_context *recdb;
+
+ /* NOTE(review): not referenced by the collect_all_db_* functions */
+ struct ctdb_pulldb pulldb;
+ /* Position in nlist->pnn_list of the node currently being pulled */
+ unsigned int index;
+};
+
+static void collect_all_db_pulldb_done(struct tevent_req *subreq);
+
+/*
+ * Start collecting a database by pulling it from every node in nlist,
+ * one node at a time, beginning with the first entry.
+ */
+static struct tevent_req *collect_all_db_send(
+			TALLOC_CTX *mem_ctx,
+			struct tevent_context *ev,
+			struct ctdb_client_context *client,
+			struct node_list *nlist,
+			uint32_t db_id,
+			struct recdb_context *recdb)
+{
+	struct collect_all_db_state *state;
+	struct tevent_req *req;
+	struct tevent_req *subreq;
+	uint32_t first_pnn;
+
+	req = tevent_req_create(mem_ctx, &state,
+				struct collect_all_db_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->nlist = nlist;
+	state->db_id = db_id;
+	state->recdb = recdb;
+	state->index = 0;
+
+	first_pnn = nlist->pnn_list[state->index];
+	subreq = pull_database_send(state, ev, client, first_pnn, recdb);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, collect_all_db_pulldb_done, req);
+
+	return req;
+}
+
+/*
+ * The pull from the current node has finished.  On failure that node
+ * is given ban credits and the request fails; otherwise continue with
+ * the next node, or complete once all nodes have been pulled.
+ */
+static void collect_all_db_pulldb_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct collect_all_db_state *state = tevent_req_data(
+		req, struct collect_all_db_state);
+	bool ok;
+	int err;
+
+	ok = pull_database_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		uint32_t failed_pnn = state->nlist->pnn_list[state->index];
+
+		node_list_ban_credits(state->nlist, failed_pnn);
+		tevent_req_error(req, err);
+		return;
+	}
+
+	state->index += 1;
+	if (state->index == state->nlist->count) {
+		/* That was the last node */
+		tevent_req_done(req);
+		return;
+	}
+
+	subreq = pull_database_send(state,
+				    state->ev,
+				    state->client,
+				    state->nlist->pnn_list[state->index],
+				    state->recdb);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, collect_all_db_pulldb_done, req);
+}
+
+/* Collect the result of a collect_all_db_send() request via generic_recv() */
+static bool collect_all_db_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+
+/**
+ * For each database do the following:
+ * - Get DB name from all nodes
+ * - Attach database on missing nodes
+ * - Get DB path
+ * - Freeze database on all nodes
+ * - Start transaction on all nodes
+ * - Collect database from all nodes
+ * - Wipe database on all nodes
+ * - Push database to all nodes
+ * - Commit transaction on all nodes
+ * - Thaw database on all nodes
+ */
+
+/* Request state for recover_db_send() and its chain of callbacks */
+struct recover_db_state {
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ /* Supplies database_hash_size and rec_buffer_size_limit */
+ struct ctdb_tunable_list *tun_list;
+ /* Nodes taking part in the recovery */
+ struct node_list *nlist;
+ /* Database being recovered (id, flags, nodes that have it) */
+ struct db *db;
+
+ /* PNN of this client; the GETDBPATH control is sent to it */
+ uint32_t destnode;
+ /* db_id plus generation (as tid) for start, wipe and commit controls */
+ struct ctdb_transdb transdb;
+
+ /* Name and path as returned by GET_DBNAME and GETDBPATH */
+ const char *db_name, *db_path;
+ /* Local database in which the collected records are gathered */
+ struct recdb_context *recdb;
+};
+
+static void recover_db_name_done(struct tevent_req *subreq);
+static void recover_db_create_missing_done(struct tevent_req *subreq);
+static void recover_db_path_done(struct tevent_req *subreq);
+static void recover_db_freeze_done(struct tevent_req *subreq);
+static void recover_db_transaction_started(struct tevent_req *subreq);
+static void recover_db_collect_done(struct tevent_req *subreq);
+static void recover_db_wipedb_done(struct tevent_req *subreq);
+static void recover_db_pushdb_done(struct tevent_req *subreq);
+static void recover_db_transaction_committed(struct tevent_req *subreq);
+static void recover_db_thaw_done(struct tevent_req *subreq);
+
+/*
+ * Start the recovery of a single database.
+ *
+ * The first step asks every node that has the database for its name
+ * (GET_DBNAME); the remaining steps are chained through the
+ * recover_db_*_done() callbacks.
+ */
+static struct tevent_req *recover_db_send(TALLOC_CTX *mem_ctx,
+					  struct tevent_context *ev,
+					  struct ctdb_client_context *client,
+					  struct ctdb_tunable_list *tun_list,
+					  struct node_list *nlist,
+					  uint32_t generation,
+					  struct db *db)
+{
+	struct recover_db_state *state;
+	struct ctdb_req_control ctl;
+	struct tevent_req *req;
+	struct tevent_req *subreq;
+
+	req = tevent_req_create(mem_ctx, &state, struct recover_db_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->tun_list = tun_list;
+	state->nlist = nlist;
+	state->db = db;
+
+	state->destnode = ctdb_client_pnn(client);
+
+	/* The generation doubles as the transaction id */
+	state->transdb.db_id = db->db_id;
+	state->transdb.tid = generation;
+
+	ctdb_req_control_get_dbname(&ctl, db->db_id);
+	subreq = ctdb_client_control_multi_send(state,
+						ev,
+						client,
+						db->pnn_list,
+						db->num_nodes,
+						TIMEOUT(),
+						&ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, recover_db_name_done, req);
+
+	return req;
+}
+
+/*
+ * All GET_DBNAME replies are in.
+ *
+ * Every node that has the database must report the same name.  The
+ * first name seen is kept in state->db_name; a node reporting a
+ * different name is given ban credits and the request fails with
+ * EINVAL.  On success the database is created on the nodes that lack
+ * it.
+ */
+static void recover_db_name_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	struct ctdb_reply_control **reply;
+	int *err_list;
+	unsigned int i;
+	int ret;
+	bool status;
+
+	status = ctdb_client_control_multi_recv(subreq,
+						&ret,
+						state,
+						&err_list,
+						&reply);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		int ret2;
+		uint32_t pnn;
+
+		ret2 = ctdb_client_control_multi_error(state->db->pnn_list,
+						       state->db->num_nodes,
+						       err_list,
+						       &pnn);
+		if (ret2 != 0) {
+			D_ERR("control GET_DBNAME failed on node %u,"
+			      " ret=%d\n",
+			      pnn,
+			      ret2);
+		} else {
+			D_ERR("control GET_DBNAME failed, ret=%d\n",
+			      ret);
+		}
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	for (i = 0; i < state->db->num_nodes; i++) {
+		const char *db_name;
+		uint32_t pnn;
+
+		/*
+		 * The control was sent to state->db->pnn_list, so that
+		 * is the list reply[i] corresponds to.
+		 */
+		pnn = state->db->pnn_list[i];
+
+		ret = ctdb_reply_control_get_dbname(reply[i],
+						    state,
+						    &db_name);
+		if (ret != 0) {
+			D_ERR("control GET_DBNAME failed on node %u "
+			      "for db=0x%x, ret=%d\n",
+			      pnn,
+			      state->db->db_id,
+			      ret);
+			tevent_req_error(req, EPROTO);
+			return;
+		}
+
+		if (state->db_name == NULL) {
+			state->db_name = db_name;
+			continue;
+		}
+
+		if (strcmp(state->db_name, db_name) != 0) {
+			D_ERR("Incompatible database name for 0x%"PRIx32" "
+			      "(%s != %s) on node %"PRIu32"\n",
+			      state->db->db_id,
+			      db_name,
+			      state->db_name,
+			      pnn);
+			node_list_ban_credits(state->nlist, pnn);
+			/*
+			 * ret is 0 here; tevent_req_error() does not
+			 * finish a request for error 0, so use a real
+			 * error code.
+			 */
+			tevent_req_error(req, EINVAL);
+			return;
+		}
+	}
+
+	talloc_free(reply);
+
+	subreq = db_create_missing_send(state,
+					state->ev,
+					state->client,
+					state->nlist,
+					state->db_name,
+					state->db);
+
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_create_missing_done, req);
+}
+
+/*
+ * The database now exists on all nodes: ask our own node for the
+ * database path (GETDBPATH).
+ */
+static void recover_db_create_missing_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	struct ctdb_req_control ctl;
+	bool ok;
+	int err;
+
+	/* Could sanity check the db_id here */
+	ok = db_create_missing_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		tevent_req_error(req, err);
+		return;
+	}
+
+	ctdb_req_control_getdbpath(&ctl, state->db->db_id);
+	subreq = ctdb_client_control_send(state,
+					  state->ev,
+					  state->client,
+					  state->destnode,
+					  TIMEOUT(),
+					  &ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_path_done, req);
+}
+
+/*
+ * GETDBPATH has completed: remember the path in state->db_path and
+ * freeze the database on all nodes.
+ */
+static void recover_db_path_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	struct ctdb_reply_control *reply;
+	struct ctdb_req_control ctl;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_recv(subreq, &err, state, &reply);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		D_ERR("control GETDBPATH failed for db %s, ret=%d\n",
+		      state->db_name, err);
+		tevent_req_error(req, err);
+		return;
+	}
+
+	err = ctdb_reply_control_getdbpath(reply, state, &state->db_path);
+	if (err != 0) {
+		D_ERR("control GETDBPATH failed for db %s, ret=%d\n",
+		      state->db_name, err);
+		tevent_req_error(req, EPROTO);
+		return;
+	}
+	talloc_free(reply);
+
+	ctdb_req_control_db_freeze(&ctl, state->db->db_id);
+	subreq = ctdb_client_control_multi_send(state,
+						state->ev,
+						state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(),
+						&ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_freeze_done, req);
+}
+
+/*
+ * The freeze has completed on all nodes.  A node that failed to freeze
+ * is given ban credits.  On success a transaction is started on all
+ * nodes.
+ */
+static void recover_db_freeze_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	struct ctdb_req_control ctl;
+	int *err_list;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, NULL, &err_list,
+					    NULL);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		uint32_t failed_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+						state->nlist->pnn_list,
+						state->nlist->count,
+						err_list,
+						&failed_pnn);
+		if (node_err == 0) {
+			D_ERR("control FREEZE_DB failed for db %s, ret=%d\n",
+			      state->db_name, err);
+		} else {
+			D_ERR("control FREEZE_DB failed for db %s"
+			      " on node %u, ret=%d\n",
+			      state->db_name, failed_pnn, node_err);
+
+			node_list_ban_credits(state->nlist, failed_pnn);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	ctdb_req_control_db_transaction_start(&ctl, &state->transdb);
+	subreq = ctdb_client_control_multi_send(state,
+						state->ev,
+						state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(),
+						&ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_transaction_started, req);
+}
+
+/*
+ * The transaction has been started on all nodes: create the local
+ * recovery database and start collecting records.  Persistent and
+ * replicated databases are collected from the node with the highest
+ * sequence number, all others from every node.
+ */
+static void recover_db_transaction_started(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	uint32_t db_flags;
+	bool use_highseqnum;
+	int *err_list;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, NULL, &err_list,
+					    NULL);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		uint32_t failed_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+						state->nlist->pnn_list,
+						state->nlist->count,
+						err_list,
+						&failed_pnn);
+		if (node_err == 0) {
+			D_ERR("control TRANSACTION_DB failed for db=%s,"
+			      " ret=%d\n", state->db_name, err);
+		} else {
+			D_ERR("control TRANSACTION_DB failed for db=%s"
+			      " on node %u, ret=%d\n",
+			      state->db_name, failed_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	db_flags = state->db->db_flags;
+
+	state->recdb = recdb_create(state,
+				    state->db->db_id,
+				    state->db_name,
+				    state->db_path,
+				    state->tun_list->database_hash_size,
+				    db_flags & CTDB_DB_FLAGS_PERSISTENT);
+	if (tevent_req_nomem(state->recdb, req)) {
+		return;
+	}
+
+	use_highseqnum = (db_flags & CTDB_DB_FLAGS_PERSISTENT) ||
+			 (db_flags & CTDB_DB_FLAGS_REPLICATED);
+	if (! use_highseqnum) {
+		subreq = collect_all_db_send(state,
+					     state->ev,
+					     state->client,
+					     state->nlist,
+					     state->db->db_id,
+					     state->recdb);
+	} else {
+		subreq = collect_highseqnum_db_send(state,
+						    state->ev,
+						    state->client,
+						    state->nlist,
+						    state->db->db_id,
+						    state->recdb);
+	}
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_collect_done, req);
+}
+
+/*
+ * Record collection has finished: wipe the database on all nodes.
+ * The receive function used must match the collector that was started
+ * for this database's flags.
+ */
+static void recover_db_collect_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	struct ctdb_req_control ctl;
+	bool used_highseqnum;
+	bool ok;
+	int err;
+
+	used_highseqnum =
+		(state->db->db_flags & CTDB_DB_FLAGS_PERSISTENT) ||
+		(state->db->db_flags & CTDB_DB_FLAGS_REPLICATED);
+	if (! used_highseqnum) {
+		ok = collect_all_db_recv(subreq, &err);
+	} else {
+		ok = collect_highseqnum_db_recv(subreq, &err);
+	}
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		tevent_req_error(req, err);
+		return;
+	}
+
+	ctdb_req_control_wipe_database(&ctl, &state->transdb);
+	subreq = ctdb_client_control_multi_send(state,
+						state->ev,
+						state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(),
+						&ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_wipedb_done, req);
+}
+
+/*
+ * The database has been wiped on all nodes: push the collected records
+ * to all nodes, using the rec_buffer_size_limit tunable as buffer size.
+ */
+static void recover_db_wipedb_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	int *err_list;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, NULL, &err_list,
+					    NULL);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		uint32_t failed_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+						state->nlist->pnn_list,
+						state->nlist->count,
+						err_list,
+						&failed_pnn);
+		if (node_err == 0) {
+			D_ERR("control WIPEDB failed for db %s, ret=%d\n",
+			      state->db_name, err);
+		} else {
+			D_ERR("control WIPEDB failed for db %s on node %u,"
+			      " ret=%d\n", state->db_name, failed_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	subreq = push_database_send(state,
+				    state->ev,
+				    state->client,
+				    state->nlist->pnn_list,
+				    state->nlist->count,
+				    state->recdb,
+				    state->tun_list->rec_buffer_size_limit);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_pushdb_done, req);
+}
+
+/*
+ * The records have been pushed to all nodes: release the local
+ * recovery database and commit the transaction on all nodes.
+ */
+static void recover_db_pushdb_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	struct ctdb_req_control ctl;
+	bool ok;
+	int err;
+
+	ok = push_database_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		tevent_req_error(req, err);
+		return;
+	}
+
+	/* The collected records are no longer needed locally */
+	TALLOC_FREE(state->recdb);
+
+	ctdb_req_control_db_transaction_commit(&ctl, &state->transdb);
+	subreq = ctdb_client_control_multi_send(state,
+						state->ev,
+						state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(),
+						&ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_transaction_committed, req);
+}
+
+/*
+ * The transaction has been committed on all nodes: thaw the database
+ * on all nodes.
+ */
+static void recover_db_transaction_committed(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct recover_db_state *state = tevent_req_data(
+		req, struct recover_db_state);
+	struct ctdb_req_control ctl;
+	int *err_list;
+	bool ok;
+	int err;
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, NULL, &err_list,
+					    NULL);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		uint32_t failed_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+						state->nlist->pnn_list,
+						state->nlist->count,
+						err_list,
+						&failed_pnn);
+		if (node_err == 0) {
+			D_ERR("control DB_TRANSACTION_COMMIT failed for db %s,"
+			      " ret=%d\n", state->db_name, err);
+		} else {
+			D_ERR("control DB_TRANSACTION_COMMIT failed for db %s"
+			      " on node %u, ret=%d\n",
+			      state->db_name, failed_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	ctdb_req_control_db_thaw(&ctl, state->db->db_id);
+	subreq = ctdb_client_control_multi_send(state,
+						state->ev,
+						state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(),
+						&ctl);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recover_db_thaw_done, req);
+}
+
+/*
+ * Called when the DB_THAW controls have completed.
+ *
+ * This is the last step of a recover_db request: a failure is logged
+ * and propagated, otherwise the request is marked done.
+ */
+static void recover_db_thaw_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recover_db_state *state;
+	int *err_list;
+	bool ok;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recover_db_state);
+
+	ok = ctdb_client_control_multi_recv(subreq, &err, NULL, &err_list,
+					    NULL);
+	TALLOC_FREE(subreq);
+	if (ok) {
+		tevent_req_done(req);
+		return;
+	}
+
+	{
+		uint32_t pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&pnn);
+		if (node_err == 0) {
+			D_ERR("control DB_THAW failed for db %s, ret=%d\n",
+			      state->db_name, err);
+		} else {
+			D_ERR("control DB_THAW failed for db %s on node %u,"
+			      " ret=%d\n", state->db_name, pnn, node_err);
+		}
+	}
+
+	tevent_req_error(req, err);
+}
+
+/*
+ * Collect the outcome of a recover_db request.
+ *
+ * Passes NULL as the second argument of generic_recv(), so no error
+ * code is retrieved here; only generic_recv()'s result is returned.
+ */
+static bool recover_db_recv(struct tevent_req *req)
+{
+	bool ok;
+
+	ok = generic_recv(req, NULL);
+	return ok;
+}
+
+
+/*
+ * Start database recovery for each database
+ *
+ * Try to recover each database up to NUM_RETRIES times before failing recovery.
+ */
+
+struct db_recovery_state { /* state of the overall db_recovery request */
+ struct tevent_context *ev; /* event context, reused when a database is retried */
+ struct db_list *dblist; /* databases to recover; num_dbs decides completion */
+ unsigned int num_replies; /* databases finished, successfully or not */
+ unsigned int num_failed; /* databases counted as failed by db_recovery_one_done() */
+};
+
+struct db_recovery_one_state { /* per-database callback data for db_recovery_one_done() */
+ struct tevent_req *req; /* the parent db_recovery request */
+ struct ctdb_client_context *client; /* passed to recover_db_send() on retry */
+ struct db_list *dblist; /* copied from db_recovery_send()'s dblist argument */
+ struct ctdb_tunable_list *tun_list; /* passed to recover_db_send() on retry */
+ struct node_list *nlist; /* passed to recover_db_send() on retry */
+ uint32_t generation; /* passed to recover_db_send() on retry */
+ struct db *db; /* the database this entry recovers */
+ int num_fails; /* failed attempts so far, compared against NUM_RETRIES */
+};
+
+static void db_recovery_one_done(struct tevent_req *subreq);
+
+/*
+ * Start recovery of every database in dblist.
+ *
+ * One recover_db request is started for each database before any
+ * result is collected.  Each carries its own db_recovery_one_state so
+ * that db_recovery_one_done() can retry it.  The request completes
+ * immediately when dblist->num_dbs is 0.
+ */
+static struct tevent_req *db_recovery_send(TALLOC_CTX *mem_ctx,
+					   struct tevent_context *ev,
+					   struct ctdb_client_context *client,
+					   struct db_list *dblist,
+					   struct ctdb_tunable_list *tun_list,
+					   struct node_list *nlist,
+					   uint32_t generation)
+{
+	struct tevent_req *req;
+	struct db_recovery_state *state;
+	struct db *cur;
+
+	req = tevent_req_create(mem_ctx, &state, struct db_recovery_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->dblist = dblist;
+	state->num_replies = 0;
+	state->num_failed = 0;
+
+	if (dblist->num_dbs == 0) {
+		tevent_req_done(req);
+		return tevent_req_post(req, ev);
+	}
+
+	cur = dblist->db;
+	while (cur != NULL) {
+		struct db_recovery_one_state *one;
+		struct tevent_req *subreq;
+
+		one = talloc_zero(state, struct db_recovery_one_state);
+		if (tevent_req_nomem(one, req)) {
+			return tevent_req_post(req, ev);
+		}
+
+		/* Remember everything needed to restart this database */
+		one->req = req;
+		one->client = client;
+		one->dblist = dblist;
+		one->tun_list = tun_list;
+		one->nlist = nlist;
+		one->generation = generation;
+		one->db = cur;
+
+		subreq = recover_db_send(state, ev, client, tun_list, nlist,
+					 generation, one->db);
+		if (tevent_req_nomem(subreq, req)) {
+			return tevent_req_post(req, ev);
+		}
+		tevent_req_set_callback(subreq, db_recovery_one_done, one);
+		D_NOTICE("recover database 0x%08x\n", one->db->db_id);
+
+		cur = cur->next;
+	}
+
+	return req;
+}
+
+/*
+ * Called when one recover_db request completes.
+ *
+ * On success the per-database state is freed and the database is
+ * counted as replied.  On failure the database is retried until it has
+ * failed NUM_RETRIES times, after which it is counted as both replied
+ * and failed.  The parent request is marked done once every database
+ * in the list has replied.
+ */
+static void db_recovery_one_done(struct tevent_req *subreq)
+{
+	struct db_recovery_one_state *substate = tevent_req_callback_data(
+		subreq, struct db_recovery_one_state);
+	struct tevent_req *req = substate->req;
+	struct db_recovery_state *state = tevent_req_data(
+		req, struct db_recovery_state);
+	bool status;
+
+	status = recover_db_recv(subreq);
+	TALLOC_FREE(subreq);
+
+	if (status) {
+		talloc_free(substate);
+		goto done;
+	}
+
+	substate->num_fails += 1;
+	if (substate->num_fails < NUM_RETRIES) {
+		subreq = recover_db_send(state,
+					 state->ev,
+					 substate->client,
+					 substate->tun_list,
+					 substate->nlist,
+					 substate->generation,
+					 substate->db);
+		if (tevent_req_nomem(subreq, req)) {
+			/*
+			 * tevent_req_nomem() has already finished req with
+			 * an error and notified its callback, which frees
+			 * req together with state.  Neither may be touched
+			 * again, and req must not be completed twice.
+			 */
+			return;
+		}
+		tevent_req_set_callback(subreq, db_recovery_one_done, substate);
+		D_NOTICE("recover database 0x%08x, attempt %d\n",
+			 substate->db->db_id, substate->num_fails+1);
+		return;
+	}
+
+	/* All attempts for this database have failed */
+	state->num_failed += 1;
+
+done:
+	state->num_replies += 1;
+
+	if (state->num_replies == state->dblist->num_dbs) {
+		tevent_req_done(req);
+	}
+}
+
+/*
+ * Collect the result of a db_recovery request.
+ *
+ * *count receives the number of databases that replied without being
+ * counted as failed, or 0 if the request itself ended in an error.
+ * Returns true only when the request succeeded and no database failed.
+ */
+static bool db_recovery_recv(struct tevent_req *req, unsigned int *count)
+{
+	struct db_recovery_state *state;
+	int unix_err;
+
+	state = tevent_req_data(req, struct db_recovery_state);
+
+	if (tevent_req_is_unix_error(req, &unix_err)) {
+		*count = 0;
+		return false;
+	}
+
+	*count = state->num_replies - state->num_failed;
+
+	return (state->num_failed == 0);
+}
+
+struct ban_node_state { /* state of a ban_node request */
+ struct tevent_context *ev; /* event context used for the controls */
+ struct ctdb_client_context *client; /* client connection used for the controls */
+ struct ctdb_tunable_list *tun_list; /* read for enable_bans and recovery_ban_period */
+ struct node_list *nlist; /* node list whose ban_credits select the node */
+ uint32_t destnode; /* set from ctdb_client_pnn(client) in ban_node_send() */
+
+ uint32_t max_pnn; /* PNN chosen by ban_node_check(); CTDB_UNKNOWN_PNN once rejected */
+};
+
+static bool ban_node_check(struct tevent_req *req);
+static void ban_node_check_done(struct tevent_req *subreq);
+static void ban_node_done(struct tevent_req *subreq);
+
+/*
+ * Start a request that bans the node holding the most ban credits.
+ *
+ * The request completes immediately, without banning anything, when
+ * the enable_bans tunable is 0 or when ban_node_check() finishes the
+ * request itself.
+ */
+static struct tevent_req *ban_node_send(TALLOC_CTX *mem_ctx,
+					struct tevent_context *ev,
+					struct ctdb_client_context *client,
+					struct ctdb_tunable_list *tun_list,
+					struct node_list *nlist)
+{
+	struct ban_node_state *state;
+	struct tevent_req *req;
+
+	req = tevent_req_create(mem_ctx, &state, struct ban_node_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->tun_list = tun_list;
+	state->nlist = nlist;
+	state->destnode = ctdb_client_pnn(client);
+
+	if (state->tun_list->enable_bans == 0) {
+		/* Bans are not enabled */
+		D_ERR("Bans are not enabled\n");
+		tevent_req_done(req);
+		return tevent_req_post(req, ev);
+	}
+
+	if (!ban_node_check(req)) {
+		/* The request was already finished by ban_node_check() */
+		return tevent_req_post(req, ev);
+	}
+
+	return req;
+}
+
+/*
+ * Pick the node with the highest ban credits and, if it has at least
+ * NUM_RETRIES credits, request the nodemap from that node.
+ *
+ * Returns true if a GET_NODEMAP control was sent.  Returns false if
+ * the request was finished here: successfully when no node has enough
+ * credits, or with an error when the control could not be allocated.
+ */
+static bool ban_node_check(struct tevent_req *req)
+{
+	struct ban_node_state *state = tevent_req_data(
+		req, struct ban_node_state);
+	struct node_list *nlist = state->nlist;
+	struct ctdb_req_control nodemap_req;
+	struct tevent_req *subreq;
+	unsigned int top_credits = 0;
+	unsigned int idx;
+
+	for (idx = 0; idx < nlist->count; idx++) {
+		if (nlist->ban_credits[idx] <= top_credits) {
+			continue;
+		}
+		state->max_pnn = nlist->pnn_list[idx];
+		top_credits = nlist->ban_credits[idx];
+	}
+
+	if (top_credits < NUM_RETRIES) {
+		tevent_req_done(req);
+		return false;
+	}
+
+	ctdb_req_control_get_nodemap(&nodemap_req);
+	subreq = ctdb_client_control_send(state, state->ev, state->client,
+					  state->max_pnn, TIMEOUT(),
+					  &nodemap_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return false;
+	}
+	tevent_req_set_callback(subreq, ban_node_check_done, req);
+
+	return true;
+}
+
+/*
+ * Called when the GET_NODEMAP control sent by ban_node_check() completes.
+ *
+ * If the candidate node is flagged inactive in the returned nodemap,
+ * its ban credits are reset and ban_node_check() is run again to pick
+ * another candidate.  Otherwise a SET_BAN_STATE control banning the
+ * node for the recovery_ban_period tunable is sent to that node.
+ */
+static void ban_node_check_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct ban_node_state *state = tevent_req_data(
+		req, struct ban_node_state);
+	struct ctdb_reply_control *reply;
+	struct ctdb_node_map *nodemap;
+	struct ctdb_req_control ban_req;
+	struct ctdb_ban_state ban;
+	unsigned int n;
+	bool received;
+	int err;
+
+	received = ctdb_client_control_recv(subreq, &err, state, &reply);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
+		      state->max_pnn, err);
+		tevent_req_error(req, err);
+		return;
+	}
+
+	err = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
+	if (err != 0) {
+		D_ERR("control GET_NODEMAP failed, ret=%d\n", err);
+		tevent_req_error(req, err);
+		return;
+	}
+
+	for (n = 0; n < nodemap->num; n++) {
+		unsigned int c;
+
+		if (nodemap->node[n].pnn != state->max_pnn) {
+			continue;
+		}
+		if ((nodemap->node[n].flags & NODE_FLAGS_INACTIVE) == 0) {
+			continue;
+		}
+
+		/* The candidate became inactive: reset its ban credits */
+		for (c = 0; c < state->nlist->count; c++) {
+			if (state->nlist->pnn_list[c] != state->max_pnn) {
+				continue;
+			}
+			state->nlist->ban_credits[c] = 0;
+			break;
+		}
+		state->max_pnn = CTDB_UNKNOWN_PNN;
+	}
+
+	talloc_free(nodemap);
+	talloc_free(reply);
+
+	/* If the node became inactive during recovery, pick the next one */
+	if (state->max_pnn == CTDB_UNKNOWN_PNN) {
+		(void) ban_node_check(req);
+		return;
+	}
+
+	ban = (struct ctdb_ban_state) {
+		.pnn = state->max_pnn,
+		.time = state->tun_list->recovery_ban_period,
+	};
+
+	D_ERR("Banning node %u for %u seconds\n", ban.pnn, ban.time);
+
+	ctdb_req_control_set_ban_state(&ban_req, &ban);
+	subreq = ctdb_client_control_send(state, state->ev, state->client,
+					  ban.pnn, TIMEOUT(), &ban_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, ban_node_done, req);
+}
+
+/*
+ * Called when the SET_BAN_STATE control completes.
+ *
+ * Finishes the ban_node request: with an error if the control could
+ * not be received or reported a failure, otherwise successfully.
+ */
+static void ban_node_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	/*
+	 * The state was created in ban_node_send() as struct
+	 * ban_node_state, so it has to be fetched with that same type.
+	 */
+	struct ban_node_state *state = tevent_req_data(
+		req, struct ban_node_state);
+	struct ctdb_reply_control *reply;
+	int ret;
+	bool status;
+
+	/* state only serves as the memory context for the reply */
+	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+	TALLOC_FREE(subreq);
+	if (! status) {
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	ret = ctdb_reply_control_set_ban_state(reply);
+	if (ret != 0) {
+		D_ERR("control SET_BAN_STATE failed, ret=%d\n", ret);
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	talloc_free(reply);
+	tevent_req_done(req);
+}
+
+/*
+ * Collect the result of a ban_node request.
+ *
+ * Returns false when tevent_req_is_unix_error() reports an error for
+ * req (perr is handed to it to receive the code), true otherwise.
+ */
+static bool ban_node_recv(struct tevent_req *req, int *perr)
+{
+	bool failed;
+
+	failed = tevent_req_is_unix_error(req, perr);
+
+	return !failed;
+}
+
+/*
+ * Run the parallel database recovery
+ *
+ * - Get tunables
+ * - Get nodemap from all nodes
+ * - Get capabilities from all nodes
+ * - Get dbmap
+ * - Set RECOVERY_ACTIVE
+ * - Send START_RECOVERY
+ * - Update vnnmap on all nodes
+ * - Run database recovery
+ * - Set RECOVERY_NORMAL
+ * - Send END_RECOVERY
+ */
+
+struct recovery_state { /* state of the top-level recovery request */
+ struct tevent_context *ev; /* event context used for every step */
+ struct ctdb_client_context *client; /* client connection used for every control */
+ uint32_t generation; /* generation given to recovery_send(); stored in the new vnnmap */
+ uint32_t destnode; /* set from ctdb_client_pnn(client) in recovery_send() */
+ struct node_list *nlist; /* built in recovery_nodemap_done(), replaced in recovery_nodemap_verify() */
+ struct ctdb_tunable_list *tun_list; /* decoded from the GET_ALL_TUNABLES reply */
+ struct ctdb_vnn_map *vnnmap; /* built in recovery_active_done() */
+ struct db_list *dblist; /* built in recovery_dbmap_done() */
+};
+
+static void recovery_tunables_done(struct tevent_req *subreq);
+static void recovery_nodemap_done(struct tevent_req *subreq);
+static void recovery_nodemap_verify(struct tevent_req *subreq);
+static void recovery_capabilities_done(struct tevent_req *subreq);
+static void recovery_dbmap_done(struct tevent_req *subreq);
+static void recovery_active_done(struct tevent_req *subreq);
+static void recovery_start_recovery_done(struct tevent_req *subreq);
+static void recovery_vnnmap_update_done(struct tevent_req *subreq);
+static void recovery_db_recovery_done(struct tevent_req *subreq);
+static void recovery_failed_done(struct tevent_req *subreq);
+static void recovery_normal_done(struct tevent_req *subreq);
+static void recovery_end_recovery_done(struct tevent_req *subreq);
+
+/*
+ * Start the recovery request.
+ *
+ * Stores the arguments in the request state and kicks off the first
+ * step: a GET_ALL_TUNABLES control sent to the node returned by
+ * ctdb_client_pnn().
+ */
+static struct tevent_req *recovery_send(TALLOC_CTX *mem_ctx,
+					struct tevent_context *ev,
+					struct ctdb_client_context *client,
+					uint32_t generation)
+{
+	struct recovery_state *state;
+	struct tevent_req *req;
+	struct tevent_req *subreq;
+	struct ctdb_req_control tunables_req;
+
+	req = tevent_req_create(mem_ctx, &state, struct recovery_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->generation = generation;
+	state->destnode = ctdb_client_pnn(client);
+
+	ctdb_req_control_get_all_tunables(&tunables_req);
+	subreq = ctdb_client_control_send(state,
+					  state->ev,
+					  state->client,
+					  state->destnode,
+					  TIMEOUT(),
+					  &tunables_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, recovery_tunables_done, req);
+
+	return req;
+}
+
+/*
+ * Called when the GET_ALL_TUNABLES control completes.
+ *
+ * Decodes the tunables into state->tun_list, copies the
+ * recover_timeout tunable into the file-level recover_timeout variable
+ * and requests the nodemap from state->destnode.
+ */
+static void recovery_tunables_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_reply_control *reply;
+	struct ctdb_req_control nodemap_req;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_recv(subreq, &err, state, &reply);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", err);
+		tevent_req_error(req, err);
+		return;
+	}
+
+	err = ctdb_reply_control_get_all_tunables(reply, state,
+						  &state->tun_list);
+	if (err != 0) {
+		/* A reply that cannot be decoded is reported as EPROTO */
+		D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", err);
+		tevent_req_error(req, EPROTO);
+		return;
+	}
+
+	talloc_free(reply);
+
+	recover_timeout = state->tun_list->recover_timeout;
+
+	ctdb_req_control_get_nodemap(&nodemap_req);
+	subreq = ctdb_client_control_send(state,
+					  state->ev,
+					  state->client,
+					  state->destnode,
+					  TIMEOUT(),
+					  &nodemap_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_nodemap_done, req);
+}
+
+/*
+ * Called when the GET_NODEMAP control to state->destnode completes.
+ *
+ * Builds state->nlist from every node that is not flagged
+ * NODE_FLAGS_DISCONNECTED, then asks each of those nodes for its own
+ * nodemap so the flags can be verified.
+ */
+static void recovery_nodemap_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_reply_control *reply;
+	struct ctdb_node_map *nodemap;
+	struct ctdb_req_control verify_req;
+	unsigned int idx;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_recv(subreq, &err, state, &reply);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
+		      state->destnode, err);
+		tevent_req_error(req, err);
+		return;
+	}
+
+	err = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
+	if (err != 0) {
+		D_ERR("control GET_NODEMAP failed, ret=%d\n", err);
+		tevent_req_error(req, err);
+		return;
+	}
+
+	state->nlist = node_list_init(state, nodemap->num);
+	if (tevent_req_nomem(state->nlist, req)) {
+		return;
+	}
+
+	for (idx = 0; idx < nodemap->num; idx++) {
+		if (nodemap->node[idx].flags & NODE_FLAGS_DISCONNECTED) {
+			continue;
+		}
+
+		if (!node_list_add(state->nlist, nodemap->node[idx].pnn)) {
+			tevent_req_error(req, EINVAL);
+			return;
+		}
+	}
+
+	talloc_free(nodemap);
+	talloc_free(reply);
+
+	/* Verify flags by getting local node information from each node */
+	ctdb_req_control_get_nodemap(&verify_req);
+	subreq = ctdb_client_control_multi_send(state, state->ev, state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(), &verify_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_nodemap_verify, req);
+}
+
+/*
+ * Called when every node in state->nlist has returned its nodemap.
+ *
+ * Each node's flags are taken from that node's own nodemap entry; a
+ * node missing from its own nodemap is treated as
+ * NODE_FLAGS_DISCONNECTED.  Nodes whose flags include
+ * NODE_FLAGS_INACTIVE are dropped.  The filtered list replaces
+ * state->nlist and GET_CAPABILITIES is sent to the remaining nodes.
+ */
+static void recovery_nodemap_verify(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_req_control caps_req;
+	struct ctdb_reply_control **reply;
+	struct node_list *verified;
+	unsigned int idx;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, state,
+						  &err_list, &reply);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("control GET_NODEMAP failed, ret=%d\n", err);
+		} else {
+			D_ERR("control GET_NODEMAP failed on node %u,"
+			      " ret=%d\n", bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	verified = node_list_init(state, state->nlist->size);
+	if (tevent_req_nomem(verified, req)) {
+		return;
+	}
+
+	for (idx = 0; idx < state->nlist->count; idx++) {
+		struct ctdb_node_map *nodemap = NULL;
+		uint32_t pnn = state->nlist->pnn_list[idx];
+		uint32_t flags = NODE_FLAGS_DISCONNECTED;
+		unsigned int k;
+
+		err = ctdb_reply_control_get_nodemap(reply[idx], state,
+						     &nodemap);
+		if (err != 0) {
+			D_ERR("control GET_NODEMAP failed on node %u\n", pnn);
+			tevent_req_error(req, EPROTO);
+			return;
+		}
+
+		/* Use the flags the node reports about itself */
+		for (k = 0; k < nodemap->num; k++) {
+			if (nodemap->node[k].pnn != pnn) {
+				continue;
+			}
+			flags = nodemap->node[k].flags;
+			break;
+		}
+
+		TALLOC_FREE(nodemap);
+
+		if (flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+
+		if (!node_list_add(verified, pnn)) {
+			tevent_req_error(req, EINVAL);
+			return;
+		}
+	}
+
+	talloc_free(reply);
+
+	/* Swap in the verified node list */
+	talloc_free(state->nlist);
+	state->nlist = verified;
+
+	ctdb_req_control_get_capabilities(&caps_req);
+	subreq = ctdb_client_control_multi_send(state, state->ev, state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(), &caps_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_capabilities_done, req);
+}
+
+/*
+ * Called when the GET_CAPABILITIES controls have completed.
+ *
+ * Stores each node's capabilities in state->nlist->caps[] and then
+ * requests the database map from every node.
+ */
+static void recovery_capabilities_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_reply_control **reply;
+	struct ctdb_req_control dbmap_req;
+	unsigned int idx;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, state,
+						  &err_list, &reply);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("control GET_CAPABILITIES failed, ret=%d\n",
+			      err);
+		} else {
+			D_ERR("control GET_CAPABILITIES failed on node %u,"
+			      " ret=%d\n", bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	for (idx = 0; idx < state->nlist->count; idx++) {
+		uint32_t caps;
+
+		err = ctdb_reply_control_get_capabilities(reply[idx], &caps);
+		if (err != 0) {
+			D_ERR("control GET_CAPABILITIES failed on node %u\n",
+			      state->nlist->pnn_list[idx]);
+			tevent_req_error(req, EPROTO);
+			return;
+		}
+
+		state->nlist->caps[idx] = caps;
+	}
+
+	talloc_free(reply);
+
+	ctdb_req_control_get_dbmap(&dbmap_req);
+	subreq = ctdb_client_control_multi_send(state, state->ev, state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(), &dbmap_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_dbmap_done, req);
+}
+
+/*
+ * Called when the GET_DBMAP controls have completed.
+ *
+ * Every database reported by every node is added to state->dblist
+ * via db_list_check_and_add(), after which the recovery mode is set
+ * to CTDB_RECOVERY_ACTIVE on all nodes.
+ */
+static void recovery_dbmap_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_reply_control **reply;
+	struct ctdb_req_control recmode_req;
+	unsigned int node_idx;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, state,
+						  &err_list, &reply);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("control GET_DBMAP failed, ret=%d\n",
+			      err);
+		} else {
+			D_ERR("control GET_DBMAP failed on node %u,"
+			      " ret=%d\n", bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	state->dblist = db_list_init(state, state->nlist->count);
+	if (tevent_req_nomem(state->dblist, req)) {
+		D_ERR("memory allocation error\n");
+		return;
+	}
+
+	for (node_idx = 0; node_idx < state->nlist->count; node_idx++) {
+		struct ctdb_dbid_map *dbmap = NULL;
+		uint32_t pnn = state->nlist->pnn_list[node_idx];
+		unsigned int db_idx;
+
+		err = ctdb_reply_control_get_dbmap(reply[node_idx], state,
+						   &dbmap);
+		if (err != 0) {
+			D_ERR("control GET_DBMAP failed on node %u\n",
+			      pnn);
+			tevent_req_error(req, EPROTO);
+			return;
+		}
+
+		for (db_idx = 0; db_idx < dbmap->num; db_idx++) {
+			err = db_list_check_and_add(state->dblist,
+						    dbmap->dbs[db_idx].db_id,
+						    dbmap->dbs[db_idx].flags,
+						    pnn);
+			if (err == 0) {
+				continue;
+			}
+
+			D_ERR("failed to add database list entry, "
+			      "ret=%d\n",
+			      err);
+			tevent_req_error(req, err);
+			return;
+		}
+
+		TALLOC_FREE(dbmap);
+	}
+
+	ctdb_req_control_set_recmode(&recmode_req, CTDB_RECOVERY_ACTIVE);
+	subreq = ctdb_client_control_multi_send(state, state->ev, state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(), &recmode_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_active_done, req);
+}
+
+/*
+ * Called when recovery mode ACTIVE has been set on all nodes.
+ *
+ * Builds the new VNN map from node_list_lmaster(), falling back to
+ * state->destnode alone when that yields no entries, stamps it with
+ * state->generation and sends START_RECOVERY to every node.
+ */
+static void recovery_active_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_req_control start_req;
+	struct ctdb_vnn_map *new_map;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, NULL,
+						  &err_list, NULL);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("failed to set recovery mode ACTIVE, ret=%d\n",
+			      err);
+		} else {
+			D_ERR("failed to set recovery mode ACTIVE on node %u,"
+			      " ret=%d\n", bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	D_ERR("Set recovery mode to ACTIVE\n");
+
+	/* Calculate new VNNMAP */
+	new_map = talloc_zero(state, struct ctdb_vnn_map);
+	if (tevent_req_nomem(new_map, req)) {
+		return;
+	}
+
+	new_map->map = node_list_lmaster(state->nlist, new_map,
+					 &new_map->size);
+	if (tevent_req_nomem(new_map->map, req)) {
+		return;
+	}
+
+	if (new_map->size == 0) {
+		D_WARNING("No active lmasters found. Adding recmaster anyway\n");
+		new_map->map[0] = state->destnode;
+		new_map->size = 1;
+	}
+
+	new_map->generation = state->generation;
+
+	state->vnnmap = new_map;
+
+	ctdb_req_control_start_recovery(&start_req);
+	subreq = ctdb_client_control_multi_send(state, state->ev, state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(), &start_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_start_recovery_done, req);
+}
+
+/*
+ * Called when the START_RECOVERY controls have completed.
+ *
+ * On success the VNN map prepared in state->vnnmap is sent to every
+ * node with a SETVNNMAP control.
+ */
+static void recovery_start_recovery_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_req_control vnnmap_req;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, NULL,
+						  &err_list, NULL);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("failed to run start_recovery event, ret=%d\n",
+			      err);
+		} else {
+			D_ERR("failed to run start_recovery event on node %u,"
+			      " ret=%d\n", bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	D_ERR("start_recovery event finished\n");
+
+	ctdb_req_control_setvnnmap(&vnnmap_req, state->vnnmap);
+	subreq = ctdb_client_control_multi_send(state, state->ev, state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(), &vnnmap_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_vnnmap_update_done, req);
+}
+
+/*
+ * Called when the SETVNNMAP controls have completed.
+ *
+ * On success the per-database recovery is started for every database
+ * in state->dblist, using the generation of the new VNN map.
+ */
+static void recovery_vnnmap_update_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, NULL,
+						  &err_list, NULL);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("failed to update VNNMAP, ret=%d\n", err);
+		} else {
+			D_ERR("failed to update VNNMAP on node %u, ret=%d\n",
+			      bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	D_NOTICE("updated VNNMAP\n");
+
+	subreq = db_recovery_send(state, state->ev, state->client,
+				  state->dblist, state->tun_list,
+				  state->nlist, state->vnnmap->generation);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_db_recovery_done, req);
+}
+
+/*
+ * Called when the database recovery request completes.
+ *
+ * Logs how many databases were recovered.  If the recovery failed, a
+ * ban_node request is started and recovery_failed_done() takes over.
+ * Otherwise the recovery mode is set back to CTDB_RECOVERY_NORMAL on
+ * all nodes.
+ */
+static void recovery_db_recovery_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_req_control recmode_req;
+	unsigned int count;
+	bool recovered;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	recovered = db_recovery_recv(subreq, &count);
+	TALLOC_FREE(subreq);
+
+	D_ERR("%d of %d databases recovered\n", count, state->dblist->num_dbs);
+
+	if (recovered) {
+		ctdb_req_control_set_recmode(&recmode_req,
+					     CTDB_RECOVERY_NORMAL);
+		subreq = ctdb_client_control_multi_send(
+				state,
+				state->ev,
+				state->client,
+				state->nlist->pnn_list,
+				state->nlist->count,
+				TIMEOUT(),
+				&recmode_req);
+		if (tevent_req_nomem(subreq, req)) {
+			return;
+		}
+		tevent_req_set_callback(subreq, recovery_normal_done, req);
+		return;
+	}
+
+	/* Recovery failed: see whether a node should be banned */
+	subreq = ban_node_send(state, state->ev, state->client,
+			       state->tun_list, state->nlist);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_failed_done, req);
+}
+
+/*
+ * Called when the ban_node request started after a failed recovery
+ * completes.
+ *
+ * A ban failure is only logged; the recovery request always ends with
+ * EIO on this path.
+ */
+static void recovery_failed_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	bool banned;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+
+	banned = ban_node_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (!banned) {
+		D_ERR("failed to ban node, ret=%d\n", err);
+	}
+
+	tevent_req_error(req, EIO);
+}
+
+/*
+ * Called when recovery mode NORMAL has been set on all nodes.
+ *
+ * On success END_RECOVERY is sent to every node in state->nlist.
+ */
+static void recovery_normal_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	struct ctdb_req_control end_req;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, state,
+						  &err_list, NULL);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("failed to set recovery mode NORMAL, ret=%d\n",
+			      err);
+		} else {
+			D_ERR("failed to set recovery mode NORMAL on node %u,"
+			      " ret=%d\n", bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	D_ERR("Set recovery mode to NORMAL\n");
+
+	ctdb_req_control_end_recovery(&end_req);
+	subreq = ctdb_client_control_multi_send(state, state->ev, state->client,
+						state->nlist->pnn_list,
+						state->nlist->count,
+						TIMEOUT(), &end_req);
+	if (tevent_req_nomem(subreq, req)) {
+		return;
+	}
+	tevent_req_set_callback(subreq, recovery_end_recovery_done, req);
+}
+
+/*
+ * Called when the END_RECOVERY controls have completed.
+ *
+ * This is the final step of the recovery request: a failure is logged
+ * and propagated, otherwise the request is marked done.
+ */
+static void recovery_end_recovery_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req;
+	struct recovery_state *state;
+	int *err_list;
+	bool received;
+	int err;
+
+	req = tevent_req_callback_data(subreq, struct tevent_req);
+	state = tevent_req_data(req, struct recovery_state);
+
+	received = ctdb_client_control_multi_recv(subreq, &err, state,
+						  &err_list, NULL);
+	TALLOC_FREE(subreq);
+	if (!received) {
+		uint32_t bad_pnn;
+		int node_err;
+
+		node_err = ctdb_client_control_multi_error(
+				state->nlist->pnn_list,
+				state->nlist->count,
+				err_list,
+				&bad_pnn);
+		if (node_err == 0) {
+			D_ERR("failed to run recovered event, ret=%d\n", err);
+		} else {
+			D_ERR("failed to run recovered event on node %u,"
+			      " ret=%d\n", bad_pnn, node_err);
+		}
+		tevent_req_error(req, err);
+		return;
+	}
+
+	D_ERR("recovered event finished\n");
+
+	tevent_req_done(req);
+}
+
+/*
+ * Collect the result of the recovery request.
+ *
+ * Hands req and perr to generic_recv() and discards its return value.
+ */
+static void recovery_recv(struct tevent_req *req, int *perr)
+{
+	(void) generic_recv(req, perr);
+}
+
+/* Print the command-line synopsis for progname to stderr */
+static void usage(const char *progname)
+{
+	fprintf(stderr,
+		"\nUsage: %s <output-fd> <ctdb-socket-path> <generation>\n",
+		progname);
+}
+
+
+/*
+ * Arguments - write fd, socket path, generation
+ */
+/*
+ * Entry point of the recovery helper.
+ *
+ * Parses the three command-line arguments, sets up logging, the event
+ * context and the ctdb client connection, then runs the recovery
+ * request to completion.  On success the (zero) result is written to
+ * the output fd and 0 is returned; any failure frees the top-level
+ * talloc context and returns 1 without writing to the output fd.
+ */
+int main(int argc, char *argv[])
+{
+	TALLOC_CTX *mem_ctx = NULL;
+	struct tevent_context *ev;
+	struct ctdb_client_context *client;
+	struct tevent_req *req;
+	const char *sockpath;
+	uint32_t generation;
+	int write_fd;
+	int ret = 0;
+	bool ok;
+
+	if (argc != 4) {
+		usage(argv[0]);
+		exit(1);
+	}
+
+	write_fd = atoi(argv[1]);
+	sockpath = argv[2];
+	generation = (uint32_t)smb_strtoul(argv[3], NULL, 0, &ret,
+					   SMB_STR_STANDARD);
+	if (ret != 0) {
+		fprintf(stderr, "recovery: unable to initialize generation\n");
+		goto failed;
+	}
+
+	mem_ctx = talloc_new(NULL);
+	if (mem_ctx == NULL) {
+		fprintf(stderr, "recovery: talloc_new() failed\n");
+		goto failed;
+	}
+
+	ret = logging_init(mem_ctx, NULL, NULL, "ctdb-recovery");
+	if (ret != 0) {
+		fprintf(stderr, "recovery: Unable to initialize logging\n");
+		goto failed;
+	}
+
+	/* From here on errors are reported through the logging macros */
+	ev = tevent_context_init(mem_ctx);
+	if (ev == NULL) {
+		D_ERR("tevent_context_init() failed\n");
+		goto failed;
+	}
+
+	ok = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL);
+	if (!ok) {
+		D_ERR("logging_setup_sighup_handler() failed\n");
+		goto failed;
+	}
+
+	ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
+	if (ret != 0) {
+		D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
+		goto failed;
+	}
+
+	req = recovery_send(mem_ctx, ev, client, generation);
+	if (req == NULL) {
+		D_ERR("database_recover_send() failed\n");
+		goto failed;
+	}
+
+	ok = tevent_req_poll(req, ev);
+	if (!ok) {
+		D_ERR("tevent_req_poll() failed\n");
+		goto failed;
+	}
+
+	recovery_recv(req, &ret);
+	TALLOC_FREE(req);
+	if (ret != 0) {
+		D_ERR("database recovery failed, ret=%d\n", ret);
+		goto failed;
+	}
+
+	/* Report the result (0 at this point) on the output fd */
+	sys_write(write_fd, &ret, sizeof(ret));
+	return 0;
+
+failed:
+	TALLOC_FREE(mem_ctx);
+	return 1;
+}
diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c
new file mode 100644
index 0000000..ec6480c
--- /dev/null
+++ b/ctdb/server/ctdb_server.c
@@ -0,0 +1,608 @@
+/*
+ ctdb main protocol code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ choose the transport we will use
+
+ Stores a talloc copy of the transport name, parented to ctdb, in
+ ctdb->transport and returns 0.  Allocation failure is handled by
+ CTDB_NO_MEMORY(), which is defined elsewhere (presumably it returns an
+ error from this function - confirm against the macro definition).
+*/
+int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport)
+{
+ ctdb->transport = talloc_strdup(ctdb, transport);
+ CTDB_NO_MEMORY(ctdb, ctdb->transport);
+
+ return 0;
+}
+
+/*
+ * Look up a node by address.
+ *
+ * Walks ctdb->nodes in index order, ignoring entries flagged
+ * NODE_FLAGS_DELETED, and returns the first node whose address matches
+ * nodeip according to ctdb_same_ip().  Returns NULL if nothing matches.
+ */
+struct ctdb_node *ctdb_ip_to_node(struct ctdb_context *ctdb,
+				  const ctdb_sock_addr *nodeip)
+{
+	unsigned int idx = 0;
+
+	while (idx < ctdb->num_nodes) {
+		struct ctdb_node *candidate = ctdb->nodes[idx];
+
+		if (!(candidate->flags & NODE_FLAGS_DELETED) &&
+		    ctdb_same_ip(&candidate->address, nodeip)) {
+			return candidate;
+		}
+		idx++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Map a node address to its PNN.
+ *
+ * Returns the pnn of the node found by ctdb_ip_to_node(), or
+ * CTDB_UNKNOWN_PNN when no node matches nodeip.
+ */
+uint32_t ctdb_ip_to_pnn(struct ctdb_context *ctdb,
+			const ctdb_sock_addr *nodeip)
+{
+	const struct ctdb_node *match = ctdb_ip_to_node(ctdb, nodeip);
+
+	if (match != NULL) {
+		return match->pnn;
+	}
+
+	return CTDB_UNKNOWN_PNN;
+}
+
+/*
+ * Convert a node map into a freshly allocated array of node structures.
+ *
+ * On return *nodes is an array (talloc child of mem_ctx) with one
+ * ctdb_node per entry of node_map and *num_nodes is node_map->num.
+ * Each node takes its address from the map, gets a name of the form
+ * "address:port" and uses its array index as pnn.
+ *
+ * Flags: a node flagged NODE_FLAGS_DELETED in the map keeps its map
+ * flags; every other node starts out as NODE_FLAGS_UNHEALTHY.  All
+ * nodes are additionally marked NODE_FLAGS_DISCONNECTED.
+ *
+ * Returns 0 on success.  Allocation failures are handled by
+ * CTDB_NO_MEMORY(); a partially built array is left in *nodes in that
+ * case.
+ */
+static int convert_node_map_to_list(struct ctdb_context *ctdb,
+				    TALLOC_CTX *mem_ctx,
+				    struct ctdb_node_map_old *node_map,
+				    struct ctdb_node ***nodes,
+				    uint32_t *num_nodes)
+{
+	unsigned int i;
+
+	*nodes = talloc_zero_array(mem_ctx,
+					struct ctdb_node *, node_map->num);
+	CTDB_NO_MEMORY(ctdb, *nodes);
+	*num_nodes = node_map->num;
+
+	for (i = 0; i < node_map->num; i++) {
+		struct ctdb_node *node;
+
+		node = talloc_zero(*nodes, struct ctdb_node);
+		CTDB_NO_MEMORY(ctdb, node);
+		(*nodes)[i] = node;
+
+		node->address = node_map->nodes[i].addr;
+		node->name = talloc_asprintf(node, "%s:%u",
+					     ctdb_addr_to_str(&node->address),
+					     ctdb_addr_to_port(&node->address));
+		/* The name is used as a "%s" argument in log messages */
+		CTDB_NO_MEMORY(ctdb, node->name);
+
+		node->flags = node_map->nodes[i].flags;
+		if (!(node->flags & NODE_FLAGS_DELETED)) {
+			node->flags = NODE_FLAGS_UNHEALTHY;
+		}
+		node->flags |= NODE_FLAGS_DISCONNECTED;
+
+		node->pnn = i;
+		node->ctdb = ctdb;
+		node->dead_count = 0;
+	}
+
+	return 0;
+}
+
+/* Load the nodes list from a file
+ *
+ * Reads ctdb->nodes_file with ctdb_read_nodes_file(), frees the
+ * previous ctdb->nodes array and replaces it, together with
+ * ctdb->num_nodes, by the converted list.
+ *
+ * No error is reported to the caller: on any failure an error is
+ * logged and the process is terminated with exit(1).
+ */
+void ctdb_load_nodes_file(struct ctdb_context *ctdb)
+{
+ struct ctdb_node_map_old *node_map;
+ int ret;
+
+ node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
+ if (node_map == NULL) {
+ goto fail;
+ }
+
+ TALLOC_FREE(ctdb->nodes);
+ ret = convert_node_map_to_list(ctdb, ctdb, node_map,
+ &ctdb->nodes, &ctdb->num_nodes);
+ if (ret == -1) {
+ goto fail;
+ }
+
+ talloc_free(node_map);
+ return;
+
+fail:
+ DEBUG(DEBUG_ERR, ("Failed to load nodes file \"%s\"\n",
+ ctdb->nodes_file));
+ talloc_free(node_map);
+ exit(1);
+}
+
+/*
+  setup the local node address
+
+  Parses address into a newly allocated ctdb->address and sets
+  ctdb->name to "address:port".
+
+  Returns 0 on success and -1 if the address cannot be parsed.
+  Allocation failures are handled by CTDB_NO_MEMORY().
+*/
+int ctdb_set_address(struct ctdb_context *ctdb, const char *address)
+{
+	ctdb->address = talloc(ctdb, ctdb_sock_addr);
+	CTDB_NO_MEMORY(ctdb, ctdb->address);
+
+	if (ctdb_parse_address(ctdb, address, ctdb->address) != 0) {
+		return -1;
+	}
+
+	ctdb->name = talloc_asprintf(ctdb, "%s:%u",
+				     ctdb_addr_to_str(ctdb->address),
+				     ctdb_addr_to_port(ctdb->address));
+	/* ctdb->name is used as a "%s" argument in log messages */
+	CTDB_NO_MEMORY(ctdb, ctdb->name);
+
+	return 0;
+}
+
+
+/*
+  count the nodes that have none of the NODE_FLAGS_INACTIVE bits set
+*/
+uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb)
+{
+	uint32_t active = 0;
+	unsigned int idx;
+
+	for (idx = 0; idx < ctdb->num_nodes; idx++) {
+		const struct ctdb_node *node = ctdb->nodes[idx];
+
+		if (node->flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+		active++;
+	}
+
+	return active;
+}
+
+
+/*
+ called when we need to process a packet. This can be a requeued packet
+ after a lockwait, or a real packet from another node
+
+ The packet is reparented onto a temporary talloc context that is freed
+ on return, so hdr is released here unless a handler steals it.
+
+ CTDB_REQ_CALL, CTDB_REPLY_CALL, CTDB_REQ_DMASTER and CTDB_REPLY_DMASTER
+ packets are dropped (with a debug message only) when this node is
+ banned or when the generation in the packet differs from ours.  All
+ other packets are dispatched on hdr->operation after bumping the
+ matching statistics counter; unknown operations are logged and dropped.
+*/
+void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ TALLOC_CTX *tmp_ctx;
+
+ /* place the packet as a child of the tmp_ctx. We then use
+ talloc_free() below to free it. If any of the calls want
+ to keep it, then they will steal it somewhere else, and the
+ talloc_free() will only free the tmp_ctx */
+ tmp_ctx = talloc_new(ctdb);
+ talloc_steal(tmp_ctx, hdr);
+
+ DEBUG(DEBUG_DEBUG,(__location__ " ctdb request %u of type %u length %u from "
+ "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
+ hdr->srcnode, hdr->destnode));
+
+ switch (hdr->operation) {
+ case CTDB_REQ_CALL:
+ case CTDB_REPLY_CALL:
+ case CTDB_REQ_DMASTER:
+ case CTDB_REPLY_DMASTER:
+ /* we don't allow these calls when banned */
+ if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_BANNED) {
+ DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
+ " request %u"
+ " length %u from node %u to %u while node"
+ " is banned\n",
+ hdr->operation, hdr->reqid,
+ hdr->length,
+ hdr->srcnode, hdr->destnode));
+ goto done;
+ }
+
+ /* for ctdb_call inter-node operations verify that the
+ remote node that sent us the call is running in the
+ same generation instance as this node
+ */
+ if (ctdb->vnn_map->generation != hdr->generation) {
+ DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
+ " request %u"
+ " length %u from node %u to %u had an"
+ " invalid generation id:%u while our"
+ " generation id is:%u\n",
+ hdr->operation, hdr->reqid,
+ hdr->length,
+ hdr->srcnode, hdr->destnode,
+ hdr->generation, ctdb->vnn_map->generation));
+ goto done;
+ }
+ }
+
+ switch (hdr->operation) {
+ case CTDB_REQ_CALL:
+ CTDB_INCREMENT_STAT(ctdb, node.req_call);
+ ctdb_request_call(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_CALL:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_call);
+ ctdb_reply_call(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_ERROR:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_error);
+ ctdb_reply_error(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_DMASTER:
+ CTDB_INCREMENT_STAT(ctdb, node.req_dmaster);
+ ctdb_request_dmaster(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_DMASTER:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_dmaster);
+ ctdb_reply_dmaster(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_MESSAGE:
+ CTDB_INCREMENT_STAT(ctdb, node.req_message);
+ ctdb_request_message(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_CONTROL:
+ CTDB_INCREMENT_STAT(ctdb, node.req_control);
+ ctdb_request_control(ctdb, hdr);
+ break;
+
+ case CTDB_REPLY_CONTROL:
+ CTDB_INCREMENT_STAT(ctdb, node.reply_control);
+ ctdb_reply_control(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_KEEPALIVE:
+ CTDB_INCREMENT_STAT(ctdb, keepalive_packets_recv);
+ ctdb_request_keepalive(ctdb, hdr);
+ break;
+
+ case CTDB_REQ_TUNNEL:
+ CTDB_INCREMENT_STAT(ctdb, node.req_tunnel);
+ ctdb_request_tunnel(ctdb, hdr);
+ break;
+
+ default:
+ DEBUG(DEBUG_CRIT,("%s: Packet with unknown operation %u\n",
+ __location__, hdr->operation));
+ break;
+ }
+
+done:
+ talloc_free(tmp_ctx);
+}
+
+
+/*
+ called by the transport layer when a node is dead
+
+ Asks the transport to restart the connection to the node; this is
+ skipped, with an error logged, when ctdb->methods is NULL.  If the
+ node was not already flagged disconnected it is then marked
+ DISCONNECTED and UNHEALTHY, the connected count is decremented, its rx
+ and dead counters are reset and ctdb_daemon_cancel_controls() is
+ called for it.
+*/
+void ctdb_node_dead(struct ctdb_node *node)
+{
+ if (node->ctdb->methods == NULL) {
+ DBG_ERR("Can not restart transport while shutting down\n");
+ return;
+ }
+ node->ctdb->methods->restart(node);
+
+ if (node->flags & NODE_FLAGS_DISCONNECTED) {
+ DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n",
+ node->ctdb->name, node->name,
+ node->ctdb->num_connected));
+ return;
+ }
+ node->ctdb->num_connected--;
+ node->flags |= NODE_FLAGS_DISCONNECTED | NODE_FLAGS_UNHEALTHY;
+ node->rx_cnt = 0;
+ node->dead_count = 0;
+
+ DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n",
+ node->ctdb->name, node->name, node->ctdb->num_connected));
+ ctdb_daemon_cancel_controls(node->ctdb, node);
+}
+
+/*
+  called by the transport layer when a node is connected
+
+  Clears NODE_FLAGS_DISCONNECTED on the node, resets its dead counter
+  and bumps the connected count.  Does nothing but log if the node is
+  not flagged disconnected.
+*/
+void ctdb_node_connected(struct ctdb_node *node)
+{
+	struct ctdb_context *ctdb = node->ctdb;
+
+	if ((node->flags & NODE_FLAGS_DISCONNECTED) == 0) {
+		DEBUG(DEBUG_INFO,("%s: node %s is already marked connected: %u connected\n",
+				  ctdb->name, node->name,
+				  ctdb->num_connected));
+		return;
+	}
+
+	node->flags &= ~NODE_FLAGS_DISCONNECTED;
+	node->dead_count = 0;
+	ctdb->num_connected++;
+
+	DEBUG(DEBUG_ERR,
+	      ("%s: connected to %s - %u connected\n",
+	       ctdb->name, node->name, ctdb->num_connected));
+}
+/* A packet deferred by ctdb_defer_packet(): the context to process it
+ * in and the private copy of the packet. */
+struct queue_next {
+ struct ctdb_context *ctdb;
+ struct ctdb_req_header *hdr;
+};
+
+
+/*
+  timer callback for a deferred packet: hand the stored packet to
+  ctdb_input_pkt() and then release the queue_next wrapper
+ */
+static void queue_next_trigger(struct tevent_context *ev,
+			       struct tevent_timer *te,
+			       struct timeval t, void *private_data)
+{
+	struct queue_next *deferred;
+
+	deferred = talloc_get_type(private_data, struct queue_next);
+	ctdb_input_pkt(deferred->ctdb, deferred->hdr);
+	talloc_free(deferred);
+}
+
+/*
+ defer a packet, so it is processed on the next event loop
+ this is used for sending packets to ourselves
+
+ A private copy of the packet (hdr->length bytes) is made, so hdr
+ itself is left with the caller.  The copy is handed to
+ queue_next_trigger() through a zero-timeout timer.  On allocation
+ failure the packet is dropped after logging an error.
+
+ NOTE(review): the result of tevent_add_timer() is not checked; if it
+ can fail here the packet would be dropped silently - confirm.
+ */
+static void ctdb_defer_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+ struct queue_next *q;
+ q = talloc(ctdb, struct queue_next);
+ if (q == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to allocate deferred packet\n"));
+ return;
+ }
+ q->ctdb = ctdb;
+ q->hdr = talloc_memdup(q, hdr, hdr->length);
+ if (q->hdr == NULL) {
+ talloc_free(q);
+ DEBUG(DEBUG_ERR,("Error copying deferred packet to self\n"));
+ return;
+ }
+#if 0
+ /* use this to put packets directly into our recv function */
+ ctdb_input_pkt(q->ctdb, q->hdr);
+#else
+ tevent_add_timer(ctdb->ev, q, timeval_zero(), queue_next_trigger, q);
+#endif
+}
+
+
+/*
+  broadcast a packet to all nodes
+
+  Queues hdr once for every node that is not flagged
+  NODE_FLAGS_DELETED, rewriting hdr->destnode for each target.
+*/
+static void ctdb_broadcast_packet_all(struct ctdb_context *ctdb,
+				      struct ctdb_req_header *hdr)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < ctdb->num_nodes; idx++) {
+		const struct ctdb_node *node = ctdb->nodes[idx];
+
+		if (!(node->flags & NODE_FLAGS_DELETED)) {
+			hdr->destnode = node->pnn;
+			ctdb_queue_packet(ctdb, hdr);
+		}
+	}
+}
+
+/*
+  broadcast a packet to all active nodes
+
+  Queues hdr once for every node that has none of the
+  NODE_FLAGS_INACTIVE bits set, rewriting hdr->destnode for each
+  target.
+*/
+static void ctdb_broadcast_packet_active(struct ctdb_context *ctdb,
+					 struct ctdb_req_header *hdr)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < ctdb->num_nodes; idx++) {
+		const struct ctdb_node *node = ctdb->nodes[idx];
+
+		if (!(node->flags & NODE_FLAGS_INACTIVE)) {
+			hdr->destnode = node->pnn;
+			ctdb_queue_packet(ctdb, hdr);
+		}
+	}
+}
+
+/*
+  broadcast a packet to all connected nodes
+
+  Queues hdr once for every node that is neither flagged
+  NODE_FLAGS_DELETED nor NODE_FLAGS_DISCONNECTED, rewriting
+  hdr->destnode for each target.
+*/
+static void ctdb_broadcast_packet_connected(struct ctdb_context *ctdb,
+					    struct ctdb_req_header *hdr)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < ctdb->num_nodes; idx++) {
+		const struct ctdb_node *node = ctdb->nodes[idx];
+
+		if (node->flags & NODE_FLAGS_DELETED) {
+			continue;
+		}
+		if (node->flags & NODE_FLAGS_DISCONNECTED) {
+			continue;
+		}
+
+		hdr->destnode = node->pnn;
+		ctdb_queue_packet(ctdb, hdr);
+	}
+}
+
+/*
+  queue a packet or die
+
+  Broadcast destinations (CTDB_BROADCAST_ALL, CTDB_BROADCAST_ACTIVE,
+  CTDB_BROADCAST_CONNECTED) are expanded by the broadcast helpers,
+  which call back into this function once per target node.  A packet
+  addressed to this node is deferred to the next event loop iteration;
+  any other packet is handed to the transport.
+
+  Packets for unknown or deleted nodes, or sent while the transport is
+  down, are dropped after logging.  A transport failure to queue the
+  packet is reported through ctdb_fatal().
+*/
+void ctdb_queue_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+	struct ctdb_node *node;
+
+	switch (hdr->destnode) {
+	case CTDB_BROADCAST_ALL:
+		ctdb_broadcast_packet_all(ctdb, hdr);
+		return;
+	case CTDB_BROADCAST_ACTIVE:
+		ctdb_broadcast_packet_active(ctdb, hdr);
+		return;
+	case CTDB_BROADCAST_CONNECTED:
+		ctdb_broadcast_packet_connected(ctdb, hdr);
+		return;
+	}
+
+	CTDB_INCREMENT_STAT(ctdb, node_packets_sent);
+
+	if (!ctdb_validate_pnn(ctdb, hdr->destnode)) {
+		DEBUG(DEBUG_CRIT,(__location__ " cant send to node %u that does not exist\n",
+				  hdr->destnode));
+		return;
+	}
+
+	node = ctdb->nodes[hdr->destnode];
+
+	if (node->flags & NODE_FLAGS_DELETED) {
+		/* destnode is unsigned: print it with %u, as done above */
+		DEBUG(DEBUG_ERR, (__location__ " Can not queue packet to DELETED node %u\n", hdr->destnode));
+		return;
+	}
+
+	if (node->pnn == ctdb->pnn) {
+		ctdb_defer_packet(ctdb, hdr);
+		return;
+	}
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_ALERT, (__location__ " Can not queue packet. "
+				    "Transport is DOWN\n"));
+		return;
+	}
+
+	node->tx_cnt++;
+	if (ctdb->methods->queue_pkt(node, (uint8_t *)hdr, hdr->length) != 0) {
+		ctdb_fatal(ctdb, "Unable to queue packet\n");
+	}
+}
+
+
+
+
+/*
+ a valgrind hack to allow us to get opcode specific backtraces
+ very ugly, and relies on no compiler optimisation!
+
+ Every case, including the default, just calls ctdb_queue_packet();
+ opcodes 1..100 each get their own call site through DO_OP().
+*/
+void ctdb_queue_packet_opcode(struct ctdb_context *ctdb, struct ctdb_req_header *hdr, unsigned opcode)
+{
+ switch (opcode) {
+#define DO_OP(x) case x: ctdb_queue_packet(ctdb, hdr); break
+ DO_OP(1);
+ DO_OP(2);
+ DO_OP(3);
+ DO_OP(4);
+ DO_OP(5);
+ DO_OP(6);
+ DO_OP(7);
+ DO_OP(8);
+ DO_OP(9);
+ DO_OP(10);
+ DO_OP(11);
+ DO_OP(12);
+ DO_OP(13);
+ DO_OP(14);
+ DO_OP(15);
+ DO_OP(16);
+ DO_OP(17);
+ DO_OP(18);
+ DO_OP(19);
+ DO_OP(20);
+ DO_OP(21);
+ DO_OP(22);
+ DO_OP(23);
+ DO_OP(24);
+ DO_OP(25);
+ DO_OP(26);
+ DO_OP(27);
+ DO_OP(28);
+ DO_OP(29);
+ DO_OP(30);
+ DO_OP(31);
+ DO_OP(32);
+ DO_OP(33);
+ DO_OP(34);
+ DO_OP(35);
+ DO_OP(36);
+ DO_OP(37);
+ DO_OP(38);
+ DO_OP(39);
+ DO_OP(40);
+ DO_OP(41);
+ DO_OP(42);
+ DO_OP(43);
+ DO_OP(44);
+ DO_OP(45);
+ DO_OP(46);
+ DO_OP(47);
+ DO_OP(48);
+ DO_OP(49);
+ DO_OP(50);
+ DO_OP(51);
+ DO_OP(52);
+ DO_OP(53);
+ DO_OP(54);
+ DO_OP(55);
+ DO_OP(56);
+ DO_OP(57);
+ DO_OP(58);
+ DO_OP(59);
+ DO_OP(60);
+ DO_OP(61);
+ DO_OP(62);
+ DO_OP(63);
+ DO_OP(64);
+ DO_OP(65);
+ DO_OP(66);
+ DO_OP(67);
+ DO_OP(68);
+ DO_OP(69);
+ DO_OP(70);
+ DO_OP(71);
+ DO_OP(72);
+ DO_OP(73);
+ DO_OP(74);
+ DO_OP(75);
+ DO_OP(76);
+ DO_OP(77);
+ DO_OP(78);
+ DO_OP(79);
+ DO_OP(80);
+ DO_OP(81);
+ DO_OP(82);
+ DO_OP(83);
+ DO_OP(84);
+ DO_OP(85);
+ DO_OP(86);
+ DO_OP(87);
+ DO_OP(88);
+ DO_OP(89);
+ DO_OP(90);
+ DO_OP(91);
+ DO_OP(92);
+ DO_OP(93);
+ DO_OP(94);
+ DO_OP(95);
+ DO_OP(96);
+ DO_OP(97);
+ DO_OP(98);
+ DO_OP(99);
+ DO_OP(100);
+ default:
+ ctdb_queue_packet(ctdb, hdr);
+ break;
+ }
+}
diff --git a/ctdb/server/ctdb_statistics.c b/ctdb/server/ctdb_statistics.c
new file mode 100644
index 0000000..4cf8f9e
--- /dev/null
+++ b/ctdb/server/ctdb_statistics.c
@@ -0,0 +1,93 @@
+/*
+ ctdb statistics code
+
+ Copyright (C) Ronnie Sahlberg 2010
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+
+#include "common/logging.h"
+/*
+ * Timer callback that rolls the statistics history.
+ *
+ * Shifts statistics_history up by one slot (the last slot is
+ * overwritten), copies statistics_current into slot 0 and stamps it
+ * with the current time, then zeroes statistics_current and restarts
+ * its start time.  Finally re-arms itself using the
+ * stat_history_interval tunable.
+ */
+static void ctdb_statistics_update(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *p)
+{
+ struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+
+ memmove(&ctdb->statistics_history[1], &ctdb->statistics_history[0], (MAX_STAT_HISTORY-1)*sizeof(struct ctdb_statistics));
+ memcpy(&ctdb->statistics_history[0], &ctdb->statistics_current, sizeof(struct ctdb_statistics));
+ ctdb->statistics_history[0].statistics_current_time = timeval_current();
+
+
+ bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics));
+ ctdb->statistics_current.statistics_start_time = timeval_current();
+
+ tevent_add_timer(ctdb->ev, ctdb,
+ timeval_current_ofs(ctdb->tunable.stat_history_interval, 0),
+ ctdb_statistics_update, ctdb);
+}
+/*
+ * Reset all statistics and start the periodic history update.
+ *
+ * Zeroes ctdb->statistics, ctdb->statistics_current and the history
+ * array, records the current time as start time of the first two, and
+ * schedules ctdb_statistics_update() using the stat_history_interval
+ * tunable.  Always returns 0.
+ */
+int ctdb_statistics_init(struct ctdb_context *ctdb)
+{
+ bzero(&ctdb->statistics, sizeof(struct ctdb_statistics));
+ ctdb->statistics.statistics_start_time = timeval_current();
+
+ bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics));
+ ctdb->statistics_current.statistics_start_time = timeval_current();
+
+ bzero(ctdb->statistics_history, sizeof(ctdb->statistics_history));
+
+ tevent_add_timer(ctdb->ev, ctdb,
+ timeval_current_ofs(ctdb->tunable.stat_history_interval, 0),
+ ctdb_statistics_update, ctdb);
+ return 0;
+}
+
+/*
+ * Control handler returning the complete statistics history.
+ *
+ * Builds a ctdb_statistics_list_old holding all MAX_STAT_HISTORY
+ * history entries, allocated as a talloc child of outdata, and
+ * returns it through outdata->dptr / outdata->dsize.  The control
+ * argument c is not used.
+ *
+ * Returns 0 on success, -1 on allocation failure.
+ */
+int32_t ctdb_control_get_stat_history(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA *outdata)
+{
+ int len;
+ struct ctdb_statistics_list_old *s;
+
+ len = offsetof(struct ctdb_statistics_list_old, stats) +
+ MAX_STAT_HISTORY*sizeof(struct ctdb_statistics);
+
+ s = talloc_size(outdata, len);
+ if (s == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to allocate statistics history structure\n"));
+ return -1;
+ }
+
+ s->num = MAX_STAT_HISTORY;
+ memcpy(&s->stats[0], &ctdb->statistics_history[0], sizeof(ctdb->statistics_history));
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)s;
+
+ return 0;
+}
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
new file mode 100644
index 0000000..4d2d041
--- /dev/null
+++ b/ctdb/server/ctdb_takeover.c
@@ -0,0 +1,2653 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "protocol/protocol_util.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/system_socket.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "server/ctdb_config.h"
+
+#include "server/ipalloc.h"
+
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
+
+#define CTDB_ARP_INTERVAL 1
+#define CTDB_ARP_REPEAT 3
+/*
+ * A local network interface that public addresses can be placed on.
+ * Entries are kept on the ctdb->ifaces list.  references is
+ * incremented when a VNN is assigned to the interface and decremented
+ * when it is unassigned.
+ */
+struct ctdb_interface {
+ struct ctdb_interface *prev, *next;
+ const char *name;
+ bool link_up;
+ uint32_t references;
+};
+/* List element of a VNN's interface list (ctdb_vnn.ifaces), pointing
+ * at one ctdb_interface entry. */
+struct vnn_interface {
+ struct vnn_interface *prev, *next;
+ struct ctdb_interface *iface;
+};
+
+/* state associated with a public ip address */
+struct ctdb_vnn {
+ struct ctdb_vnn *prev, *next;
+
+ struct ctdb_interface *iface; /* interface currently assigned, NULL if none */
+ struct vnn_interface *ifaces; /* interfaces this address may be assigned to */
+ ctdb_sock_addr public_address;
+ uint8_t public_netmask_bits;
+
+ /*
+ * The node number that is serving this public address - set
+ * to CTDB_UNKNOWN_PNN if no node is known to be serving it
+ */
+ uint32_t pnn;
+
+ /* List of clients to tickle for this public address */
+ struct ctdb_tcp_array *tcp_array;
+
+ /* whether we need to update the other nodes with changes to our list
+ of connected clients */
+ bool tcp_update_needed;
+
+ /* a context to hang sending gratuitous arp events off */
+ TALLOC_CTX *takeover_ctx;
+
+ /* Set to true any time an update to this VNN is in flight.
+ This helps to avoid races. */
+ bool update_in_flight;
+
+ /* If CTDB_CONTROL_DEL_PUBLIC_IP is received for this IP
+ * address then this flag is set. It will be deleted in the
+ * release IP callback. */
+ bool delete_pending;
+};
+
+/* Printable name of an interface; "__none__" stands in for NULL */
+static const char *iface_string(const struct ctdb_interface *iface)
+{
+	if (iface == NULL) {
+		return "__none__";
+	}
+
+	return iface->name;
+}
+
+/* Printable name of the interface currently assigned to a VNN */
+static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
+{
+	const struct ctdb_interface *assigned = vnn->iface;
+
+	return iface_string(assigned);
+}
+
+static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
+ const char *iface);
+/*
+ * Return the ctdb_interface entry named iface, creating it (with
+ * link_up set to true) and adding it to ctdb->ifaces if no entry with
+ * that name exists yet.
+ *
+ * Returns NULL if the name is longer than CTDB_IFACE_SIZE or on
+ * allocation failure.
+ */
+static struct ctdb_interface *
+ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
+{
+ struct ctdb_interface *i;
+
+ if (strlen(iface) > CTDB_IFACE_SIZE) {
+ DEBUG(DEBUG_ERR, ("Interface name too long \"%s\"\n", iface));
+ return NULL;
+ }
+
+ /* Reuse the existing entry if we already know this interface */
+ i = ctdb_find_iface(ctdb, iface);
+ if (i != NULL) {
+ return i;
+ }
+
+ /* create a new structure for this interface */
+ i = talloc_zero(ctdb, struct ctdb_interface);
+ if (i == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ return NULL;
+ }
+ i->name = talloc_strdup(i, iface);
+ if (i->name == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ talloc_free(i);
+ return NULL;
+ }
+
+ i->link_up = true;
+
+ DLIST_ADD(ctdb->ifaces, i);
+
+ return i;
+}
+
+/* Check whether iface appears (by pointer) in the VNN's interface list */
+static bool vnn_has_interface(struct ctdb_vnn *vnn,
+			      const struct ctdb_interface *iface)
+{
+	struct vnn_interface *entry = vnn->ifaces;
+
+	while (entry != NULL) {
+		if (entry->iface == iface) {
+			return true;
+		}
+		entry = entry->next;
+	}
+
+	return false;
+}
+
+/* If any interfaces now have no possible IPs then delete them. This
+ * implementation is naive (i.e. simple) rather than clever
+ * (i.e. complex). Given that this is run on delip and that operation
+ * is rare, this doesn't need to be efficient - it needs to be
+ * foolproof. One alternative is reference counting, where the logic
+ * is distributed and can, therefore, be broken in multiple places.
+ * Another alternative is to build a red-black tree of interfaces that
+ * can have addresses (by walking ctdb->vnn once) and then walking
+ * ctdb->ifaces once and deleting those not in the tree. Let's go to
+ * one of those if the naive implementation causes problems... :-)
+ *
+ * Only interfaces listed in the given vnn are considered. Each of
+ * them is removed from ctdb->ifaces and freed when no VNN on the
+ * ctdb->vnn list names it.
+ *
+ * NOTE(review): if vnn itself were still on ctdb->vnn when this runs,
+ * nothing could ever be removed - presumably the caller unlinks it
+ * first; confirm against the caller.
+ */
+static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn)
+{
+ struct ctdb_interface *i, *next;
+
+ /* For each interface, check if there's an IP using it. */
+ for (i = ctdb->ifaces; i != NULL; i = next) {
+ struct ctdb_vnn *tv;
+ bool found;
+ next = i->next;
+
+ /* Only consider interfaces named in the given VNN. */
+ if (!vnn_has_interface(vnn, i)) {
+ continue;
+ }
+
+ /* Search for a vnn with this interface. */
+ found = false;
+ for (tv=ctdb->vnn; tv; tv=tv->next) {
+ if (vnn_has_interface(tv, i)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ /* None of the VNNs are using this interface. */
+ DLIST_REMOVE(ctdb->ifaces, i);
+ talloc_free(i);
+ }
+ }
+}
+
+
+/* Find the entry on ctdb->ifaces whose name equals iface, or NULL */
+static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
+					      const char *iface)
+{
+	struct ctdb_interface *cur = ctdb->ifaces;
+
+	while (cur != NULL) {
+		if (strcmp(cur->name, iface) == 0) {
+			break;
+		}
+		cur = cur->next;
+	}
+
+	return cur;
+}
+
+/*
+ * Pick the interface a VNN should be placed on.
+ *
+ * Among the VNN's interfaces that have link_up set, returns the one
+ * with the fewest references; on a tie the one that comes first in
+ * the list wins.  Returns NULL if no interface has its link up.
+ */
+static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
+						  struct ctdb_vnn *vnn)
+{
+	struct ctdb_interface *best = NULL;
+	struct vnn_interface *entry;
+
+	for (entry = vnn->ifaces; entry != NULL; entry = entry->next) {
+		struct ctdb_interface *candidate = entry->iface;
+
+		if (!candidate->link_up) {
+			continue;
+		}
+
+		if (best == NULL ||
+		    candidate->references < best->references) {
+			best = candidate;
+		}
+	}
+
+	return best;
+}
+
+/*
+ * Assign a VNN to the best available interface.
+ *
+ * If the VNN already has an interface nothing changes.  Otherwise the
+ * interface chosen by ctdb_vnn_best_iface() is assigned, its reference
+ * count is incremented and vnn->pnn is set to this node's pnn.
+ *
+ * Returns 0 on success (including the already-assigned case) and -1
+ * if no usable interface exists.
+ */
+static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
+				     struct ctdb_vnn *vnn)
+{
+	struct ctdb_interface *best = NULL;
+
+	if (vnn->iface) {
+		DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+				   "still assigned to iface '%s'\n",
+				   ctdb_addr_to_str(&vnn->public_address),
+				   ctdb_vnn_iface_string(vnn)));
+		return 0;
+	}
+
+	best = ctdb_vnn_best_iface(ctdb, vnn);
+	if (best == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
+				  "cannot assign to iface any iface\n",
+				  ctdb_addr_to_str(&vnn->public_address)));
+		return -1;
+	}
+
+	vnn->iface = best;
+	best->references++;
+	vnn->pnn = ctdb->pnn;
+
+	/* references is a uint32_t, so it is printed with %u */
+	DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+			   "now assigned to iface '%s' refs[%u]\n",
+			   ctdb_addr_to_str(&vnn->public_address),
+			   ctdb_vnn_iface_string(vnn),
+			   best->references));
+	return 0;
+}
+
+/*
+ * Detach a VNN from its interface.
+ *
+ * Drops the reference held on the currently assigned interface (if
+ * any), clears vnn->iface and, if the VNN was marked as served by this
+ * node, resets vnn->pnn to CTDB_UNKNOWN_PNN.
+ */
+static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
+				    struct ctdb_vnn *vnn)
+{
+	/* The reference count is unsigned, so it is printed with %u */
+	DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+			   "now unassigned (old iface '%s' refs[%u])\n",
+			   ctdb_addr_to_str(&vnn->public_address),
+			   ctdb_vnn_iface_string(vnn),
+			   vnn->iface?vnn->iface->references:0U));
+	if (vnn->iface) {
+		vnn->iface->references--;
+	}
+	vnn->iface = NULL;
+	if (vnn->pnn == ctdb->pnn) {
+		vnn->pnn = CTDB_UNKNOWN_PNN;
+	}
+}
+
+/*
+ * Can this node currently host the given VNN?
+ *
+ * False unless the daemon is in CTDB_RUNSTATE_RUNNING, the local node
+ * has none of the INACTIVE/DISABLED flags set and the VNN is not
+ * pending deletion.  Given that, true if the assigned interface or any
+ * interface on the VNN's list has its link up.
+ */
+static bool ctdb_vnn_available(struct ctdb_context *ctdb,
+			       struct ctdb_vnn *vnn)
+{
+	struct vnn_interface *entry;
+	uint32_t local_flags;
+
+	/* Nodes that are not RUNNING can not host IPs */
+	if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
+		return false;
+	}
+
+	local_flags = ctdb->nodes[ctdb->pnn]->flags;
+	if (local_flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED)) {
+		return false;
+	}
+
+	if (vnn->delete_pending) {
+		return false;
+	}
+
+	if (vnn->iface != NULL && vnn->iface->link_up) {
+		return true;
+	}
+
+	entry = vnn->ifaces;
+	while (entry != NULL) {
+		if (entry->iface->link_up) {
+			return true;
+		}
+		entry = entry->next;
+	}
+
+	return false;
+}
+/*
+ * State of one ARP / TCP tickle announcement sequence for a public
+ * address: count is the number of rounds sent so far, tcparray the
+ * connections to send tickle ACKs for (may be NULL).
+ */
+struct ctdb_takeover_arp {
+ struct ctdb_context *ctdb;
+ uint32_t count;
+ ctdb_sock_addr addr;
+ struct ctdb_tcp_array *tcparray;
+ struct ctdb_vnn *vnn;
+};
+
+
+/*
+ lists of tcp endpoints
+
+ doubly-linked list element (prev/next) holding one ctdb_connection
+ */
+struct ctdb_tcp_list {
+ struct ctdb_tcp_list *prev, *next;
+ struct ctdb_connection connection;
+};
+
+/*
+ list of clients to kill on IP release
+
+ each element records a client id together with an address
+ */
+struct ctdb_client_ip {
+ struct ctdb_client_ip *prev, *next;
+ struct ctdb_context *ctdb;
+ ctdb_sock_addr addr;
+ uint32_t client_id;
+};
+
+
+/*
+ send a gratuitous arp
+
+ Timer callback.  Sends one ARP for arp->addr on the interface currently
+ assigned to the VNN, followed by a TCP tickle ACK for every connection
+ in arp->tcparray.  Send failures are logged but do not stop the
+ sequence.  The callback re-arms itself until it has run
+ CTDB_ARP_REPEAT times and then frees arp.  If the VNN no longer has
+ an interface assigned the sequence is cancelled and arp is freed.
+ */
+static void ctdb_control_send_arp(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
+ struct ctdb_takeover_arp);
+ int ret;
+ struct ctdb_tcp_array *tcparray;
+ const char *iface;
+
+ /* IP address might have been released between sends */
+ if (arp->vnn->iface == NULL) {
+ DBG_INFO("Cancelling ARP send for released IP %s\n",
+ ctdb_addr_to_str(&arp->vnn->public_address));
+ talloc_free(arp);
+ return;
+ }
+
+ iface = ctdb_vnn_iface_string(arp->vnn);
+ ret = ctdb_sys_send_arp(&arp->addr, iface);
+ if (ret != 0) {
+ DBG_ERR("Failed to send ARP on interface %s: %s\n",
+ iface, strerror(ret));
+ }
+
+ tcparray = arp->tcparray;
+ if (tcparray) {
+ unsigned int i;
+
+ for (i=0;i<tcparray->num;i++) {
+ struct ctdb_connection *tcon;
+ char buf[128];
+
+ tcon = &tcparray->connections[i];
+ ret = ctdb_connection_to_buf(buf,
+ sizeof(buf),
+ tcon,
+ false,
+ " -> ");
+ if (ret != 0) {
+ strlcpy(buf, "UNKNOWN", sizeof(buf));
+ }
+ D_INFO("Send TCP tickle ACK: %s\n", buf);
+ ret = ctdb_sys_send_tcp(
+ &tcon->src,
+ &tcon->dst,
+ 0, 0, 0);
+ if (ret != 0) {
+ DBG_ERR("Failed to send TCP tickle ACK: %s\n",
+ buf);
+ }
+ }
+ }
+
+ arp->count++;
+
+ if (arp->count == CTDB_ARP_REPEAT) {
+ talloc_free(arp);
+ return;
+ }
+
+ tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
+ timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
+ ctdb_control_send_arp, arp);
+}
+/*
+ * Start the ARP / TCP tickle announcement sequence for a VNN.
+ *
+ * Creates vnn->takeover_ctx on first use and allocates the sequence
+ * state under it.  Any connection list in vnn->tcp_array is moved to
+ * that state, leaving vnn->tcp_array NULL and tcp_update_needed set.
+ * ctdb_control_send_arp() is then scheduled with a zero timeout.
+ *
+ * Returns 0 on success, -1 on allocation failure.
+ */
+static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn)
+{
+ struct ctdb_takeover_arp *arp;
+ struct ctdb_tcp_array *tcparray;
+
+ if (!vnn->takeover_ctx) {
+ vnn->takeover_ctx = talloc_new(vnn);
+ if (!vnn->takeover_ctx) {
+ return -1;
+ }
+ }
+
+ arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
+ if (!arp) {
+ return -1;
+ }
+
+ arp->ctdb = ctdb;
+ arp->addr = vnn->public_address;
+ arp->vnn = vnn;
+
+ tcparray = vnn->tcp_array;
+ if (tcparray) {
+ /* add all of the known tcp connections for this IP to the
+ list of tcp connections to send tickle acks for */
+ arp->tcparray = talloc_steal(arp, tcparray);
+
+ vnn->tcp_array = NULL;
+ vnn->tcp_update_needed = true;
+ }
+
+ tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
+ timeval_zero(), ctdb_control_send_arp, arp);
+
+ return 0;
+}
+/* State carried from ctdb_do_takeip() to its event script callback:
+ * the control to reply to and the VNN being taken over. */
+struct ctdb_do_takeip_state {
+ struct ctdb_req_control_old *c;
+ struct ctdb_vnn *vnn;
+};
+
+/*
+ called when takeip event finishes
+
+ On failure the control is answered with the event status; a status of
+ -ETIMEDOUT additionally bans this node.  On success the address is
+ announced (only when ctdb->do_checkpublicip is set), a
+ CTDB_SRVID_TAKE_IP message carrying the address string is sent to
+ this node, and the control is answered with 0.  The state is freed on
+ every path.
+ */
+static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ struct ctdb_do_takeip_state *state =
+ talloc_get_type(private_data, struct ctdb_do_takeip_state);
+ int32_t ret;
+ TDB_DATA data;
+
+ if (status != 0) {
+ if (status == -ETIMEDOUT) {
+ ctdb_ban_self(ctdb);
+ }
+ DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+ ctdb_addr_to_str(&state->vnn->public_address),
+ ctdb_vnn_iface_string(state->vnn)));
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+
+ talloc_free(state);
+ return;
+ }
+
+ if (ctdb->do_checkpublicip) {
+
+ ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
+ if (ret != 0) {
+ ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+ talloc_free(state);
+ return;
+ }
+
+ }
+
+ data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
+ data.dsize = strlen((char *)data.dptr) + 1;
+ DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
+
+ ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
+
+
+ /* the control succeeded */
+ ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+ talloc_free(state);
+ return;
+}
+/* talloc destructor for the takeip state: clears the VNN's
+ * update_in_flight flag whenever the state is freed. */
+static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
+{
+ state->vnn->update_in_flight = false;
+ return 0;
+}
+
+/*
+ take over an ip address
+
+ Assigns an interface to the VNN and starts the CTDB_EVENT_TAKE_IP
+ event script with "<iface> <address> <netmask bits>" as arguments.
+ The reply to the control is sent later by ctdb_do_takeip_callback().
+
+ Returns 0 once the event script has been started; c is reparented to
+ ctdb only in that case.  Returns -1 if an update for this VNN is
+ already in flight, if no interface can be assigned or if the event
+ script cannot be started; c is left untouched on those paths.
+
+ NOTE(review): when starting the event script fails, the interface
+ assignment made by ctdb_vnn_assign_iface() is not undone here -
+ confirm that this is intended.
+ */
+static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ struct ctdb_vnn *vnn)
+{
+ int ret;
+ struct ctdb_do_takeip_state *state;
+
+ if (vnn->update_in_flight) {
+ DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
+ "update for this IP already in flight\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits));
+ return -1;
+ }
+
+ ret = ctdb_vnn_assign_iface(ctdb, vnn);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
+ "assign a usable interface\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits));
+ return -1;
+ }
+
+ state = talloc(vnn, struct ctdb_do_takeip_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ state->c = NULL;
+ state->vnn = vnn;
+
+ vnn->update_in_flight = true;
+ talloc_set_destructor(state, ctdb_takeip_destructor);
+
+ DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+
+ ret = ctdb_event_script_callback(ctdb,
+ state,
+ ctdb_do_takeip_callback,
+ state,
+ CTDB_EVENT_TAKE_IP,
+ "%s %s %u",
+ ctdb_vnn_iface_string(vnn),
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits);
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn)));
+ talloc_free(state);
+ return -1;
+ }
+
+ state->c = talloc_steal(ctdb, c);
+ return 0;
+}
+/* State carried from ctdb_do_updateip() to its event script callback:
+ * the control to reply to, the interface the address was on before
+ * the update, and the VNN being moved. */
+struct ctdb_do_updateip_state {
+ struct ctdb_req_control_old *c;
+ struct ctdb_interface *old;
+ struct ctdb_vnn *vnn;
+};
+
+/*
+ called when updateip event finishes
+
+ On failure the VNN is put back on its old interface (taking a new
+ reference on it) and the control is answered with the event status; a
+ status of -ETIMEDOUT additionally bans this node.  On success the
+ control is answered with 0.  The state is freed on both paths.
+
+ NOTE(review): the failure path dereferences state->old without a NULL
+ check - presumably an update is only started for a VNN that already
+ has an interface; confirm against the caller.
+ */
+static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ struct ctdb_do_updateip_state *state =
+ talloc_get_type(private_data, struct ctdb_do_updateip_state);
+
+ if (status != 0) {
+ if (status == -ETIMEDOUT) {
+ ctdb_ban_self(ctdb);
+ }
+ DEBUG(DEBUG_ERR,
+ ("Failed update of IP %s from interface %s to %s\n",
+ ctdb_addr_to_str(&state->vnn->public_address),
+ iface_string(state->old),
+ ctdb_vnn_iface_string(state->vnn)));
+
+ /*
+ * All we can do is reset the old interface
+ * and let the next run fix it
+ */
+ ctdb_vnn_unassign_iface(ctdb, state->vnn);
+ state->vnn->iface = state->old;
+ state->vnn->iface->references++;
+
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+ talloc_free(state);
+ return;
+ }
+
+ /* the control succeeded */
+ ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+ talloc_free(state);
+ return;
+}
+
+/*
+  talloc destructor for the updateip state: once the state goes away
+  no update is in flight for its vnn any more
+ */
+static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
+{
+	struct ctdb_vnn *vnn = state->vnn;
+
+	vnn->update_in_flight = false;
+	return 0;
+}
+
+/*
+ update (move) an ip address
+
+ Re-runs interface assignment for vnn and, if the assigned interface
+ changed, queues the "updateip" event.  Returns 0 on success and -1 on
+ failure.  On success the reply to c is sent either right here (the
+ interface did not change) or later from ctdb_do_updateip_callback().
+ */
+static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ struct ctdb_vnn *vnn)
+{
+ int ret;
+ struct ctdb_do_updateip_state *state;
+ /* remember the current assignment before it is replaced below */
+ struct ctdb_interface *old = vnn->iface;
+ const char *old_name = iface_string(old);
+ const char *new_name;
+
+ /* only one update per address may be in flight at a time */
+ if (vnn->update_in_flight) {
+ DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
+ "update for this IP already in flight\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits));
+ return -1;
+ }
+
+ /* drop the current assignment, then assign an interface again */
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ ret = ctdb_vnn_assign_iface(ctdb, vnn);
+ if (ret != 0) {
+ /* NOTE(review): the previous assignment is not restored on this path */
+ DEBUG(DEBUG_ERR,("Update of IP %s/%u failed to "
+ "assign a usable interface (old iface '%s')\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ old_name));
+ return -1;
+ }
+
+ if (old == vnn->iface) {
+ /* A benign update from one interface onto itself.
+ * no need to run the eventscripts in this case, just return
+ * success.
+ */
+ ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
+ return 0;
+ }
+
+ state = talloc(vnn, struct ctdb_do_updateip_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ state->c = NULL;
+ state->old = old;
+ state->vnn = vnn;
+
+ /* the destructor clears update_in_flight again when state is freed */
+ vnn->update_in_flight = true;
+ talloc_set_destructor(state, ctdb_updateip_destructor);
+
+ new_name = ctdb_vnn_iface_string(vnn);
+ DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
+ "interface %s to %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ old_name,
+ new_name));
+
+ /* event arguments: old interface, new interface, address, netmask bits */
+ ret = ctdb_event_script_callback(ctdb,
+ state,
+ ctdb_do_updateip_callback,
+ state,
+ CTDB_EVENT_UPDATE_IP,
+ "%s %s %s %u",
+ old_name,
+ new_name,
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ ("Failed update IP %s from interface %s to %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ old_name, new_name));
+ talloc_free(state);
+ return -1;
+ }
+
+ /* keep the request around so the callback can reply to it */
+ state->c = talloc_steal(ctdb, c);
+ return 0;
+}
+
+/*
+  Look up the vnn entry for a public ip address.
+  Returns NULL if the address is not known as a public address.
+ */
+static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
+{
+	struct ctdb_vnn *cur = ctdb->vnn;
+
+	/* walk the list until it ends or the address matches */
+	while (cur != NULL && !ctdb_same_ip(&cur->public_address, addr)) {
+		cur = cur->next;
+	}
+
+	return cur;
+}
+
+/*
+  take over an ip address
+
+  Handler for the takeover-ip control.  indata carries a
+  struct ctdb_public_ip naming the address and the node that is to
+  host it.  Depending on what ctdb_sys_have_ip() reports (when address
+  checking is enabled) and on the interface currently assigned to the
+  vnn, this starts a "takeip" event, starts an "updateip" event, or
+  does nothing.
+
+  Returns 0 on success and -1 on failure.  *async_reply is set to true
+  whenever the takeip/updateip code has taken responsibility for
+  replying to c.
+ */
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
+				 struct ctdb_req_control_old *c,
+				 TDB_DATA indata,
+				 bool *async_reply)
+{
+	int ret;
+	struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
+	struct ctdb_vnn *vnn;
+	bool have_ip = false;
+	bool do_updateip = false;
+	bool do_takeip = false;
+	struct ctdb_interface *best_iface = NULL;
+
+	if (pip->pnn != ctdb->pnn) {
+		DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
+				 "with pnn %d, but we're node %d\n",
+				 ctdb_addr_to_str(&pip->addr),
+				 pip->pnn, ctdb->pnn));
+		return -1;
+	}
+
+	/* update our vnn list */
+	vnn = find_public_ip_vnn(ctdb, &pip->addr);
+	if (vnn == NULL) {
+		DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
+			ctdb_addr_to_str(&pip->addr)));
+		return 0;
+	}
+
+	/* have_ip stays false when address checking is not enabled */
+	if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) {
+		have_ip = ctdb_sys_have_ip(&pip->addr);
+	}
+	best_iface = ctdb_vnn_best_iface(ctdb, vnn);
+	if (best_iface == NULL) {
+		DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find "
+				 "a usable interface (old %s, have_ip %d)\n",
+				 ctdb_addr_to_str(&vnn->public_address),
+				 vnn->public_netmask_bits,
+				 ctdb_vnn_iface_string(vnn),
+				 have_ip));
+		return -1;
+	}
+
+	if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != CTDB_UNKNOWN_PNN) {
+		DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+				  "and we have it on iface[%s], but it was assigned to node %d "
+				  "and we are node %d, banning ourself\n",
+				 ctdb_addr_to_str(&vnn->public_address),
+				 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
+		ctdb_ban_self(ctdb);
+		return -1;
+	}
+
+	if (vnn->pnn == CTDB_UNKNOWN_PNN && have_ip) {
+		/* This will cause connections to be reset and
+		 * reestablished.  However, this is a very unusual
+		 * situation and doing this will completely repair the
+		 * inconsistency in the VNN.
+		 */
+		DEBUG(DEBUG_WARNING,
+		      (__location__
+		       " Doing updateip for IP %s already on an interface\n",
+		       ctdb_addr_to_str(&vnn->public_address)));
+		do_updateip = true;
+	}
+
+	if (vnn->iface) {
+		if (vnn->iface != best_iface) {
+			if (!vnn->iface->link_up) {
+				do_updateip = true;
+			} else if (vnn->iface->references > (best_iface->references + 1)) {
+				/* only move when the rebalance gains something */
+				do_updateip = true;
+			}
+		}
+	}
+
+	if (!have_ip) {
+		/*
+		 * Nothing to move: forget the stale interface
+		 * assignment and do a plain takeip instead.
+		 */
+		if (do_updateip) {
+			ctdb_vnn_unassign_iface(ctdb, vnn);
+			do_updateip = false;
+		}
+		do_takeip = true;
+	}
+
+	if (do_takeip) {
+		ret = ctdb_do_takeip(ctdb, c, vnn);
+		if (ret != 0) {
+			return -1;
+		}
+	} else if (do_updateip) {
+		ret = ctdb_do_updateip(ctdb, c, vnn);
+		if (ret != 0) {
+			return -1;
+		}
+	} else {
+		/*
+		 * The interface is up and the kernel knows the ip
+		 * => do nothing
+		 */
+		DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
+			ctdb_addr_to_str(&pip->addr),
+			vnn->public_netmask_bits,
+			ctdb_vnn_iface_string(vnn)));
+		return 0;
+	}
+
+	/* tell ctdb_control.c that we will be replying asynchronously */
+	*async_reply = true;
+
+	return 0;
+}
+
+/*
+ * Remove a public address: unlink the vnn from ctdb->vnn, drop its
+ * interface assignment, clean up interfaces via
+ * ctdb_remove_orphaned_ifaces() and free the vnn.  The vnn pointer
+ * is invalid after this call.
+ */
+static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
+{
+ DLIST_REMOVE(ctdb->vnn, vnn);
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ ctdb_remove_orphaned_ifaces(ctdb, vnn);
+ talloc_free(vnn);
+}
+
+/*
+  Common work after an IP address has been released: notify local
+  clients, drop the interface assignment and, if the address was
+  marked for deletion, delete it.
+
+  Returns vnn, or NULL if the vnn was deleted (and freed).
+ */
+static struct ctdb_vnn *release_ip_post(struct ctdb_context *ctdb,
+					struct ctdb_vnn *vnn,
+					ctdb_sock_addr *addr)
+{
+	TDB_DATA msg;
+
+	/*
+	 * Tell every client of this node that the cluster has been
+	 * reconfigured so that they close their connections on this
+	 * IP address.  The payload is the address as a string,
+	 * including the terminating NUL.
+	 */
+	msg.dptr = (uint8_t *)ctdb_addr_to_str(addr);
+	msg.dsize = strlen((char *)msg.dptr)+1;
+	DEBUG(DEBUG_INFO, ("Sending RELEASE_IP message for %s\n", msg.dptr));
+	ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, msg);
+
+	ctdb_vnn_unassign_iface(ctdb, vnn);
+
+	if (!vnn->delete_pending) {
+		return vnn;
+	}
+
+	/* the IP was marked for deletion, so finish that now */
+	do_delete_ip(ctdb, vnn);
+	return NULL;
+}
+
+/*
+ * State handed from ctdb_control_release_ip() to release_ip_callback()
+ * while the "releaseip" event is running.
+ */
+struct release_ip_callback_state {
+ /* control to reply to; NULL until the event has been queued */
+ struct ctdb_req_control_old *c;
+ /* copy of the address being released, owned by this state */
+ ctdb_sock_addr *addr;
+ /* vnn of the address; set to NULL by the callback if the vnn was deleted */
+ struct ctdb_vnn *vnn;
+ /* pnn from the request, stored into vnn->pnn once the release succeeded */
+ uint32_t target_pnn;
+};
+
+/*
+  called when releaseip event finishes
+
+  A timeout bans this node.  If address checking is enabled and the
+  address is still reported as present, the control fails with -1.
+  Otherwise the vnn is updated to the target node, the post-release
+  work is done and the control is acknowledged with 0.
+ */
+static void release_ip_callback(struct ctdb_context *ctdb, int status,
+				void *private_data)
+{
+	struct release_ip_callback_state *state =
+		talloc_get_type(private_data, struct release_ip_callback_state);
+	bool check_ip;
+	int32_t reply_status = 0;
+
+	if (status == -ETIMEDOUT) {
+		ctdb_ban_self(ctdb);
+	}
+
+	check_ip = (ctdb_config.failover_disabled == 0 &&
+		    ctdb->do_checkpublicip);
+
+	if (check_ip && ctdb_sys_have_ip(state->addr)) {
+		DEBUG(DEBUG_ERR,
+		      ("IP %s still hosted during release IP callback, failing\n",
+		       ctdb_addr_to_str(state->addr)));
+		reply_status = -1;
+	} else {
+		/* the release worked: record the new owner and clean up */
+		state->vnn->pnn = state->target_pnn;
+		state->vnn = release_ip_post(ctdb, state->vnn, state->addr);
+	}
+
+	ctdb_request_control_reply(ctdb, state->c, NULL, reply_status, NULL);
+	talloc_free(state);
+}
+
+/*
+  talloc destructor for the releaseip state: clear the in-flight
+  marker on the vnn, unless the vnn is already gone
+ */
+static int ctdb_releaseip_destructor(struct release_ip_callback_state *state)
+{
+	if (state->vnn == NULL) {
+		return 0;
+	}
+
+	state->vnn->update_in_flight = false;
+	return 0;
+}
+
+/*
+ release an ip address
+
+ Handler for the release-ip control.  indata carries a
+ struct ctdb_public_ip naming the address and the node it moves to.
+ Redundant releases only update vnn->pnn.  Otherwise the "releaseip"
+ event is queued and *async_reply is set to true; the reply is then
+ sent from release_ip_callback().
+
+ Returns 0 on success and -1 on failure.
+ */
+int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA indata,
+ bool *async_reply)
+{
+ int ret;
+ struct release_ip_callback_state *state;
+ struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
+ struct ctdb_vnn *vnn;
+ const char *iface;
+
+ /* update our vnn list */
+ vnn = find_public_ip_vnn(ctdb, &pip->addr);
+ if (vnn == NULL) {
+ DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
+ ctdb_addr_to_str(&pip->addr)));
+ return 0;
+ }
+
+ /* stop any previous arps */
+ talloc_free(vnn->takeover_ctx);
+ vnn->takeover_ctx = NULL;
+
+ /* RELEASE_IP controls are sent to all nodes that should not
+ * be hosting a particular IP. This serves 2 purposes. The
+ * first is to help resolve any inconsistencies. If a node
+ * does unexpectedly host an IP then it will be released. The
+ * 2nd is to use a "redundant release" to tell non-takeover
+ * nodes where an IP is moving to. This is how "ctdb ip" can
+ * report the (likely) location of an IP by only asking the
+ * local node. Redundant releases need to update the PNN but
+ * are otherwise ignored.
+ */
+ if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) {
+ if (!ctdb_sys_have_ip(&pip->addr)) {
+ DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
+ ctdb_addr_to_str(&pip->addr),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+ vnn->pnn = pip->pnn;
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ return 0;
+ }
+ } else {
+ /* without address checking, fall back to the recorded interface assignment */
+ if (vnn->iface == NULL) {
+ DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
+ ctdb_addr_to_str(&pip->addr),
+ vnn->public_netmask_bits));
+ vnn->pnn = pip->pnn;
+ return 0;
+ }
+ }
+
+ /* There is a potential race between take_ip and us because we
+ * update the VNN via a callback that runs when the
+ * eventscripts have been run. Avoid the race by allowing one
+ * update to be in flight at a time.
+ */
+ if (vnn->update_in_flight) {
+ DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
+ "update for this IP already in flight\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits));
+ return -1;
+ }
+
+ iface = ctdb_vnn_iface_string(vnn);
+
+ DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
+ ctdb_addr_to_str(&pip->addr),
+ vnn->public_netmask_bits,
+ iface,
+ pip->pnn));
+
+ state = talloc(ctdb, struct release_ip_callback_state);
+ if (state == NULL) {
+ ctdb_set_error(ctdb, "Out of memory at %s:%d",
+ __FILE__, __LINE__);
+ return -1;
+ }
+
+ state->c = NULL;
+ /* keep a private copy of the address for the callback */
+ state->addr = talloc(state, ctdb_sock_addr);
+ if (state->addr == NULL) {
+ ctdb_set_error(ctdb, "Out of memory at %s:%d",
+ __FILE__, __LINE__);
+ talloc_free(state);
+ return -1;
+ }
+ *state->addr = pip->addr;
+ state->target_pnn = pip->pnn;
+ state->vnn = vnn;
+
+ /* the destructor clears update_in_flight again when state is freed */
+ vnn->update_in_flight = true;
+ talloc_set_destructor(state, ctdb_releaseip_destructor);
+
+ /* event arguments: interface, address, netmask bits */
+ ret = ctdb_event_script_callback(ctdb,
+ state, release_ip_callback, state,
+ CTDB_EVENT_RELEASE_IP,
+ "%s %s %u",
+ iface,
+ ctdb_addr_to_str(&pip->addr),
+ vnn->public_netmask_bits);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
+ ctdb_addr_to_str(&pip->addr),
+ ctdb_vnn_iface_string(vnn)));
+ talloc_free(state);
+ return -1;
+ }
+
+ /* tell the control that we will be replying asynchronously */
+ *async_reply = true;
+ /* the request now lives as long as the state does */
+ state->c = talloc_steal(state, c);
+ return 0;
+}
+
+/*
+  Add a public address to ctdb->vnn.
+
+  addr/mask give the address and its netmask bits, ifaces is a
+  comma-separated list of interface names the address may be hosted
+  on.  Every listed interface must exist and is registered with
+  ctdb_add_local_iface().  The new vnn starts without an owner
+  (CTDB_UNKNOWN_PNN).
+
+  Returns 0 on success and -1 on a duplicate address, an unknown
+  interface or an allocation failure.  check_address is not used
+  here.
+ */
+static int ctdb_add_public_address(struct ctdb_context *ctdb,
+				   ctdb_sock_addr *addr,
+				   unsigned mask, const char *ifaces,
+				   bool check_address)
+{
+	struct ctdb_vnn *vnn;
+	char *tmp;
+	char *saveptr = NULL;
+	const char *iface;
+
+	/* Verify that we don't have an entry for this IP yet */
+	for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+		if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
+			D_ERR("Duplicate public IP address '%s'\n",
+			      ctdb_addr_to_str(addr));
+			return -1;
+		}
+	}
+
+	/* Create a new VNN structure for this IP address */
+	vnn = talloc_zero(ctdb, struct ctdb_vnn);
+	if (vnn == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return -1;
+	}
+	/* tokenising modifies the string, so work on a private copy */
+	tmp = talloc_strdup(vnn, ifaces);
+	if (tmp == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		talloc_free(vnn);
+		return -1;
+	}
+	/*
+	 * Use the reentrant tokeniser: the caller tokenises the
+	 * configuration line itself and must not have its state
+	 * disturbed by this loop.
+	 */
+	for (iface = strtok_r(tmp, ",", &saveptr);
+	     iface != NULL;
+	     iface = strtok_r(NULL, ",", &saveptr)) {
+		struct vnn_interface *vnn_iface;
+		struct ctdb_interface *i;
+
+		if (!ctdb_sys_check_iface_exists(iface)) {
+			D_ERR("Unknown interface %s for public address %s\n",
+			      iface,
+			      ctdb_addr_to_str(addr));
+			talloc_free(vnn);
+			return -1;
+		}
+
+		i = ctdb_add_local_iface(ctdb, iface);
+		if (i == NULL) {
+			D_ERR("Failed to add interface '%s' "
+			      "for public address %s\n",
+			      iface,
+			      ctdb_addr_to_str(addr));
+			talloc_free(vnn);
+			return -1;
+		}
+
+		vnn_iface = talloc_zero(vnn, struct vnn_interface);
+		if (vnn_iface == NULL) {
+			DBG_ERR("Memory allocation error\n");
+			talloc_free(vnn);
+			return -1;
+		}
+
+		vnn_iface->iface = i;
+		DLIST_ADD_END(vnn->ifaces, vnn_iface);
+	}
+	talloc_free(tmp);
+	vnn->public_address = *addr;
+	vnn->public_netmask_bits = mask;
+	vnn->pnn = CTDB_UNKNOWN_PNN;
+
+	DLIST_ADD(ctdb->vnn, vnn);
+
+	return 0;
+}
+
+/*
+ setup the public address lists from a file
+
+ Uses ctdb->public_addresses_file, defaulting to
+ $CTDB_BASE/public_addresses when it is not set.  Each non-empty,
+ non-comment line holds an address (with netmask) followed by a
+ comma-separated interface list, separated by spaces or tabs.
+
+ Returns 0 on success, including when the file does not exist, and
+ -1 on any error.
+*/
+int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
+{
+ bool ok;
+ char **lines;
+ int nlines;
+ int i;
+
+ /* If no public addresses file given then try the default */
+ if (ctdb->public_addresses_file == NULL) {
+ const char *b = getenv("CTDB_BASE");
+ if (b == NULL) {
+ DBG_ERR("CTDB_BASE not set\n");
+ return -1;
+ }
+ ctdb->public_addresses_file = talloc_asprintf(
+ ctdb, "%s/%s", b, "public_addresses");
+ if (ctdb->public_addresses_file == NULL) {
+ DBG_ERR("Out of memory\n");
+ return -1;
+ }
+ }
+
+ /* If the file doesn't exist then warn and do nothing */
+ ok = file_exist(ctdb->public_addresses_file);
+ if (!ok) {
+ D_WARNING("Not loading public addresses, no file %s\n",
+ ctdb->public_addresses_file);
+ return 0;
+ }
+
+ lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
+ if (lines == NULL) {
+ ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
+ return -1;
+ }
+ /* ignore trailing empty lines */
+ while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
+ nlines--;
+ }
+
+ for (i=0;i<nlines;i++) {
+ unsigned mask;
+ ctdb_sock_addr addr;
+ const char *addrstr;
+ const char *ifaces;
+ char *tok, *line;
+ int ret;
+
+ /* skip leading whitespace, then comment and empty lines */
+ line = lines[i];
+ while ((*line == ' ') || (*line == '\t')) {
+ line++;
+ }
+ if (*line == '#') {
+ continue;
+ }
+ if (strcmp(line, "") == 0) {
+ continue;
+ }
+ /* first field: the address */
+ tok = strtok(line, " \t");
+ addrstr = tok;
+
+ /* second field: the interface list */
+ tok = strtok(NULL, " \t");
+ if (tok == NULL) {
+ D_ERR("No interface specified at line %u "
+ "of public addresses file\n", i+1);
+ talloc_free(lines);
+ return -1;
+ }
+ ifaces = tok;
+
+ if (addrstr == NULL) {
+ D_ERR("Badly formed line %u in public address list\n",
+ i+1);
+ talloc_free(lines);
+ return -1;
+ }
+
+ ret = ctdb_sock_addr_mask_from_string(addrstr, &addr, &mask);
+ if (ret != 0) {
+ D_ERR("Badly formed line %u in public address list\n",
+ i+1);
+ talloc_free(lines);
+ return -1;
+ }
+
+ if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
+ DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
+ talloc_free(lines);
+ return -1;
+ }
+ }
+
+
+ D_NOTICE("Loaded public addresses from %s\n",
+ ctdb->public_addresses_file);
+
+ talloc_free(lines);
+ return 0;
+}
+
+/*
+ destroy a ctdb_client_ip structure
+
+ talloc destructor: logs the entry and unlinks it from
+ ctdb->client_ip_list.
+ */
+static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
+{
+ DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
+ ctdb_addr_to_str(&ip->addr),
+ ntohs(ip->addr.ip.sin_port),
+ ip->client_id));
+
+ DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
+ return 0;
+}
+
+/*
+ called by a client to inform us of a TCP connection that it is managing
+ that should be tickled with an ACK when IP takeover is done
+
+ indata carries a struct ctdb_connection.  The connection is recorded
+ against the client and broadcast to the connected nodes with
+ CTDB_CONTROL_TCP_ADD.
+
+ Returns 0 on success or when the connection is ignored, -1 when the
+ destination is a public address this node does not hold, on
+ allocation failure, or when the broadcast cannot be sent.
+ */
+int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
+ TDB_DATA indata)
+{
+ /* NOTE(review): client is dereferenced below without a NULL check - confirm reqid_find() cannot fail here */
+ struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ struct ctdb_connection *tcp_sock = NULL;
+ struct ctdb_tcp_list *tcp;
+ struct ctdb_connection t;
+ int ret;
+ TDB_DATA data;
+ struct ctdb_client_ip *ip;
+ struct ctdb_vnn *vnn;
+ ctdb_sock_addr src_addr;
+ ctdb_sock_addr dst_addr;
+
+ /* If we don't have public IPs, tickles are useless */
+ if (ctdb->vnn == NULL) {
+ return 0;
+ }
+
+ tcp_sock = (struct ctdb_connection *)indata.dptr;
+
+ /*
+ * ctdb_canonicalize_ip() is given a copy of the address and
+ * writes into tcp_sock->src (presumably the canonical form -
+ * helper not visible here); the result is then copied back
+ * into the zeroed local.
+ */
+ src_addr = tcp_sock->src;
+ ctdb_canonicalize_ip(&src_addr, &tcp_sock->src);
+ ZERO_STRUCT(src_addr);
+ memcpy(&src_addr, &tcp_sock->src, sizeof(src_addr));
+
+ /* same treatment for the destination address */
+ dst_addr = tcp_sock->dst;
+ ctdb_canonicalize_ip(&dst_addr, &tcp_sock->dst);
+ ZERO_STRUCT(dst_addr);
+ memcpy(&dst_addr, &tcp_sock->dst, sizeof(dst_addr));
+
+ vnn = find_public_ip_vnn(ctdb, &dst_addr);
+ if (vnn == NULL) {
+ char *src_addr_str = NULL;
+ char *dst_addr_str = NULL;
+
+ /* not a public address: log it, except for IPv4 loopback */
+ switch (dst_addr.sa.sa_family) {
+ case AF_INET:
+ if (ntohl(dst_addr.ip.sin_addr.s_addr) == INADDR_LOOPBACK) {
+ /* ignore ... */
+ return 0;
+ }
+ break;
+ case AF_INET6:
+ break;
+ default:
+ DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n",
+ dst_addr.sa.sa_family));
+ return 0;
+ }
+
+ src_addr_str = ctdb_sock_addr_to_string(client, &src_addr, false);
+ dst_addr_str = ctdb_sock_addr_to_string(client, &dst_addr, false);
+ DEBUG(DEBUG_ERR,(
+ "Could not register TCP connection from "
+ "%s to %s (not a public address) (port %u) "
+ "(client_id %u pid %u).\n",
+ src_addr_str,
+ dst_addr_str,
+ ctdb_sock_addr_port(&dst_addr),
+ client_id, client->pid));
+ TALLOC_FREE(src_addr_str);
+ TALLOC_FREE(dst_addr_str);
+ return 0;
+ }
+
+ if (vnn->pnn != ctdb->pnn) {
+ DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
+ ctdb_addr_to_str(&dst_addr),
+ client_id, client->pid));
+ /* failing this call will tell smbd to die */
+ return -1;
+ }
+
+ /* record which public address this client uses; freed with the client */
+ ip = talloc(client, struct ctdb_client_ip);
+ CTDB_NO_MEMORY(ctdb, ip);
+
+ ip->ctdb = ctdb;
+ ip->addr = dst_addr;
+ ip->client_id = client_id;
+ talloc_set_destructor(ip, ctdb_client_ip_destructor);
+ DLIST_ADD(ctdb->client_ip_list, ip);
+
+ /* remember the connection on the client's own list */
+ tcp = talloc(client, struct ctdb_tcp_list);
+ CTDB_NO_MEMORY(ctdb, tcp);
+
+ tcp->connection.src = tcp_sock->src;
+ tcp->connection.dst = tcp_sock->dst;
+
+ DLIST_ADD(client->tcp_list, tcp);
+
+ /* payload for the TCP_ADD broadcast below */
+ t.src = tcp_sock->src;
+ t.dst = tcp_sock->dst;
+
+ data.dptr = (uint8_t *)&t;
+ data.dsize = sizeof(t);
+
+ switch (dst_addr.sa.sa_family) {
+ case AF_INET:
+ DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
+ (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
+ ctdb_addr_to_str(&tcp_sock->src),
+ (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
+ break;
+ case AF_INET6:
+ DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
+ (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
+ ctdb_addr_to_str(&tcp_sock->src),
+ (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
+ break;
+ default:
+ DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n",
+ dst_addr.sa.sa_family));
+ }
+
+
+ /* tell all nodes about this tcp connection */
+ ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
+ CTDB_CONTROL_TCP_ADD,
+ 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+  Search a tcp array for a connection with the same source and
+  destination socket addresses as tcp.
+  Returns the matching entry inside the array, or NULL if there is no
+  array or no match.
+ */
+static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
+					   struct ctdb_connection *tcp)
+{
+	unsigned int idx = 0;
+
+	if (array == NULL) {
+		return NULL;
+	}
+
+	while (idx < array->num) {
+		struct ctdb_connection *candidate = &array->connections[idx];
+
+		if (ctdb_same_sockaddr(&candidate->src, &tcp->src) &&
+		    ctdb_same_sockaddr(&candidate->dst, &tcp->dst)) {
+			return candidate;
+		}
+		idx++;
+	}
+
+	return NULL;
+}
+
+
+
+/*
+  called by a daemon to inform us of a TCP connection that one of its
+  clients is managing and that should be tickled with an ACK when IP
+  takeover is done
+
+  indata carries a struct ctdb_connection.  The connection is added
+  to the tcp array of the vnn matching its destination address unless
+  it is already there.  When tcp_update_needed is true the vnn is
+  flagged as needing a tickle update.
+
+  Returns 0 on success (also when there are no public addresses or
+  the connection is already known), -1 if the destination is not a
+  public address or on allocation failure.
+ */
+int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
+{
+	struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
+	struct ctdb_tcp_array *tcparray;
+	struct ctdb_connection *conns;
+	struct ctdb_connection tcp;
+	struct ctdb_vnn *vnn;
+
+	/* If we don't have public IPs, tickles are useless */
+	if (ctdb->vnn == NULL) {
+		return 0;
+	}
+
+	vnn = find_public_ip_vnn(ctdb, &p->dst);
+	if (vnn == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
+			ctdb_addr_to_str(&p->dst)));
+
+		return -1;
+	}
+
+
+	tcparray = vnn->tcp_array;
+
+	/* If this is the first tickle */
+	if (tcparray == NULL) {
+		tcparray = talloc(vnn, struct ctdb_tcp_array);
+		CTDB_NO_MEMORY(ctdb, tcparray);
+		vnn->tcp_array = tcparray;
+
+		tcparray->num = 0;
+		tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
+		CTDB_NO_MEMORY(ctdb, tcparray->connections);
+
+		tcparray->connections[tcparray->num].src = p->src;
+		tcparray->connections[tcparray->num].dst = p->dst;
+		tcparray->num++;
+
+		if (tcp_update_needed) {
+			vnn->tcp_update_needed = true;
+		}
+		return 0;
+	}
+
+
+	/* Do we already have this tickle ?*/
+	tcp.src = p->src;
+	tcp.dst = p->dst;
+	if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
+		DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
+			ctdb_addr_to_str(&tcp.dst),
+			ntohs(tcp.dst.ip.sin_port),
+			vnn->pnn));
+		return 0;
+	}
+
+	/*
+	 * A new tickle, we must add it to the array.  Grow through a
+	 * temporary so that a failed reallocation leaves the existing
+	 * array (and its num) intact instead of replacing it with
+	 * NULL.
+	 */
+	conns = talloc_realloc(tcparray, tcparray->connections,
+			       struct ctdb_connection,
+			       tcparray->num+1);
+	CTDB_NO_MEMORY(ctdb, conns);
+	tcparray->connections = conns;
+
+	tcparray->connections[tcparray->num].src = p->src;
+	tcparray->connections[tcparray->num].dst = p->dst;
+	tcparray->num++;
+
+	DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
+		ctdb_addr_to_str(&tcp.dst),
+		ntohs(tcp.dst.ip.sin_port),
+		vnn->pnn));
+
+	if (tcp_update_needed) {
+		vnn->tcp_update_needed = true;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Remove a connection from the tcp array of vnn, if it is there.
+ * Frees the array when it becomes empty and flags the vnn as needing
+ * a tickle update.  Does nothing when vnn is NULL, the vnn has no
+ * array, or the connection is not in the array.
+ */
+static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
+{
+ struct ctdb_connection *tcpp;
+
+ if (vnn == NULL) {
+ return;
+ }
+
+ /* if the array is empty we can't remove it
+ and we don't need to do anything
+ */
+ if (vnn->tcp_array == NULL) {
+ DEBUG(DEBUG_INFO,("Trying to remove tickle that doesn't exist (array is empty) %s:%u\n",
+ ctdb_addr_to_str(&conn->dst),
+ ntohs(conn->dst.ip.sin_port)));
+ return;
+ }
+
+
+ /* See if we know this connection
+ if we don't know this connection then we don't need to do anything
+ */
+ tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
+ if (tcpp == NULL) {
+ DEBUG(DEBUG_INFO,("Trying to remove tickle that doesn't exist %s:%u\n",
+ ctdb_addr_to_str(&conn->dst),
+ ntohs(conn->dst.ip.sin_port)));
+ return;
+ }
+
+
+ /* We need to remove this entry from the array.
+ Instead of allocating a new array and copying data to it
+ we cheat and just copy the last entry in the existing array
+ to the entry that is to be removed and just shrink the
+ ->num field
+ */
+ *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
+ vnn->tcp_array->num--;
+
+ /* If we deleted the last entry we also need to remove the entire array
+ */
+ if (vnn->tcp_array->num == 0) {
+ talloc_free(vnn->tcp_array);
+ vnn->tcp_array = NULL;
+ }
+
+ vnn->tcp_update_needed = true;
+
+ /* note: this message reports the source address of the connection */
+ DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
+ ctdb_addr_to_str(&conn->src),
+ ntohs(conn->src.ip.sin_port)));
+}
+
+
+/*
+  called by a daemon to tell us that a TCP connection used by one of
+  its clients is no longer needed in the tickle database
+
+  indata carries a struct ctdb_connection.  Always returns 0.
+ */
+int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
+	struct ctdb_vnn *owner;
+
+	/* Only bother when we have public IPs; without them tickles are useless */
+	if (ctdb->vnn != NULL) {
+		owner = find_public_ip_vnn(ctdb, &conn->dst);
+		if (owner != NULL) {
+			ctdb_remove_connection(owner, conn);
+		} else {
+			DEBUG(DEBUG_ERR,
+			      (__location__ " unable to find public address %s\n",
+			       ctdb_addr_to_str(&conn->dst)));
+		}
+	}
+
+	return 0;
+}
+
+
+static void ctdb_send_set_tcp_tickles_for_all(struct ctdb_context *ctdb,
+ bool force);
+
+/*
+ Called when another daemon starts - causes all tickles for all
+ public addresses we are serving to be sent to the new node on the
+ next check. This actually causes the tickles to be sent to the
+ other node immediately. In case there is an error, the periodic
+ timer will send the updates on timer event. This is simple and
+ doesn't require careful error handling.
+
+ pnn is the node the startup control came from; it is only logged.
+ Always returns 0.
+ */
+int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
+{
+ DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
+ (unsigned long) pnn));
+
+ /* second argument is the "force" flag of the helper */
+ ctdb_send_set_tcp_tickles_for_all(ctdb, true);
+ return 0;
+}
+
+
+/*
+  called when a client structure goes away - hook to remove
+  elements from the tcp_list in all daemons
+
+  Empties client->tcp_list.  A connection is removed from the tickle
+  array only when its destination address is hosted on this node.
+ */
+void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
+{
+	struct ctdb_tcp_list *entry;
+
+	while ((entry = client->tcp_list) != NULL) {
+		struct ctdb_connection *conn = &entry->connection;
+		struct ctdb_vnn *vnn;
+
+		DLIST_REMOVE(client->tcp_list, entry);
+
+		vnn = find_public_ip_vnn(client->ctdb,
+					 &conn->dst);
+		if (vnn == NULL) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ " unable to find public address %s\n",
+			       ctdb_addr_to_str(&conn->dst)));
+		} else if (vnn->pnn == client->ctdb->pnn) {
+			/* The IP address is hosted on this node, so
+			 * the connection can be removed. */
+			ctdb_remove_connection(vnn, conn);
+		}
+
+		/* In the remaining case the server IP address has
+		 * been released to another node and the client has
+		 * exited.  The connection information must be kept
+		 * because the takeover node processes connections
+		 * too. */
+	}
+}
+
+
+/*
+ * Release every public address that ctdb_sys_have_ip() reports as
+ * present, running the "releaseip" event synchronously for each one.
+ * Does nothing when failover is disabled.  Addresses with an update
+ * already in flight are skipped.
+ */
+void ctdb_release_all_ips(struct ctdb_context *ctdb)
+{
+ struct ctdb_vnn *vnn, *next;
+ int count = 0;
+
+ if (ctdb_config.failover_disabled == 1) {
+ return;
+ }
+
+ for (vnn = ctdb->vnn; vnn != NULL; vnn = next) {
+ /* vnn can be freed below in release_ip_post() */
+ next = vnn->next;
+
+ /* address not present: just drop the interface assignment */
+ if (!ctdb_sys_have_ip(&vnn->public_address)) {
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ continue;
+ }
+
+ /* Don't allow multiple releases at once. Some code,
+ * particularly ctdb_tickle_sentenced_connections() is
+ * not re-entrant */
+ if (vnn->update_in_flight) {
+ DEBUG(DEBUG_WARNING,
+ (__location__
+ " Not releasing IP %s/%u on interface %s, an update is already in progress\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+ continue;
+ }
+ vnn->update_in_flight = true;
+
+ DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+
+ /* return value deliberately not checked, see the comment below */
+ ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
+ ctdb_vnn_iface_string(vnn),
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits);
+ /* releaseip timeouts are converted to success, so to
+ * detect failures just check if the IP address is
+ * still there...
+ */
+ if (ctdb_sys_have_ip(&vnn->public_address)) {
+ DEBUG(DEBUG_ERR,
+ (__location__
+ " IP address %s not released\n",
+ ctdb_addr_to_str(&vnn->public_address)));
+ vnn->update_in_flight = false;
+ continue;
+ }
+
+ /* returns NULL when the vnn was deleted as part of the release */
+ vnn = release_ip_post(ctdb, vnn, &vnn->public_address);
+ if (vnn != NULL) {
+ vnn->update_in_flight = false;
+ }
+ count++;
+ }
+
+ DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
+}
+
+
+/*
+  get list of public IPs
+
+  Fills outdata with a struct ctdb_public_ip_list_old holding the
+  address and current pnn of each public address.  When the request
+  carries CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE only addresses for which
+  ctdb_vnn_available() is true are listed.
+
+  Returns 0 on success, -1 on allocation failure.
+ */
+int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
+				    struct ctdb_req_control_old *c, TDB_DATA *outdata)
+{
+	int used, total, size;
+	struct ctdb_public_ip_list_old *list;
+	struct ctdb_vnn *cur;
+	bool only_available =
+		((c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) != 0);
+
+	/* size the buffer for every public address we know about */
+	total = 0;
+	cur = ctdb->vnn;
+	while (cur != NULL) {
+		total++;
+		cur = cur->next;
+	}
+
+	size = offsetof(struct ctdb_public_ip_list_old, ips) +
+		total*sizeof(struct ctdb_public_ip);
+	list = talloc_zero_size(outdata, size);
+	CTDB_NO_MEMORY(ctdb, list);
+
+	used = 0;
+	for (cur = ctdb->vnn; cur != NULL; cur = cur->next) {
+		if (!only_available || ctdb_vnn_available(ctdb, cur)) {
+			list->ips[used].pnn = cur->pnn;
+			list->ips[used].addr = cur->public_address;
+			used++;
+		}
+	}
+	list->num = used;
+
+	/* report only the part of the buffer that was filled in */
+	size = offsetof(struct ctdb_public_ip_list_old, ips) +
+		used*sizeof(struct ctdb_public_ip);
+
+	outdata->dsize = size;
+	outdata->dptr = (uint8_t *)list;
+
+	return 0;
+}
+
+
+/*
+ * Return details about one public address: its current pnn and the
+ * interfaces configured for it.  indata carries the ctdb_sock_addr to
+ * look up; outdata receives a struct ctdb_public_ip_info_old.
+ *
+ * Returns 0 on success, -1 if the address is not a public address or
+ * on allocation failure.
+ */
+int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA indata,
+ TDB_DATA *outdata)
+{
+ int i, num, len;
+ ctdb_sock_addr *addr;
+ struct ctdb_public_ip_info_old *info;
+ struct ctdb_vnn *vnn;
+ struct vnn_interface *iface;
+
+ addr = (ctdb_sock_addr *)indata.dptr;
+
+ vnn = find_public_ip_vnn(ctdb, addr);
+ if (vnn == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
+ "'%s'not a public address\n",
+ ctdb_addr_to_str(addr)));
+ return -1;
+ }
+
+ /* count how many interfaces are configured for this address */
+ num = 0;
+ for (iface = vnn->ifaces; iface != NULL; iface = iface->next) {
+ num++;
+ }
+
+ len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
+ num*sizeof(struct ctdb_iface);
+ info = talloc_zero_size(outdata, len);
+ CTDB_NO_MEMORY(ctdb, info);
+
+ info->ip.addr = vnn->public_address;
+ info->ip.pnn = vnn->pnn;
+ /* stays 0xFFFFFFFF when none of the interfaces is the assigned one */
+ info->active_idx = 0xFFFFFFFF;
+
+ i = 0;
+ for (iface = vnn->ifaces; iface != NULL; iface = iface->next) {
+ struct ctdb_interface *cur;
+
+ cur = iface->iface;
+ if (vnn->iface == cur) {
+ info->active_idx = i;
+ }
+ /* bounded copy, then force termination in case the name was truncated */
+ strncpy(info->ifaces[i].name, cur->name,
+ sizeof(info->ifaces[i].name));
+ info->ifaces[i].name[sizeof(info->ifaces[i].name)-1] = '\0';
+ info->ifaces[i].link_state = cur->link_up;
+ info->ifaces[i].references = cur->references;
+
+ i++;
+ }
+ info->num = i;
+ len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
+ i*sizeof(struct ctdb_iface);
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)info;
+
+ return 0;
+}
+
+/*
+ * Return the list of interfaces in ctdb->ifaces with their link state
+ * and reference count.  outdata receives a struct ctdb_iface_list_old.
+ *
+ * Returns 0 on success, -1 on allocation failure.
+ */
+int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ TDB_DATA *outdata)
+{
+ int i, num, len;
+ struct ctdb_iface_list_old *ifaces;
+ struct ctdb_interface *cur;
+
+ /* count how many interfaces we have */
+ num = 0;
+ for (cur=ctdb->ifaces;cur;cur=cur->next) {
+ num++;
+ }
+
+ len = offsetof(struct ctdb_iface_list_old, ifaces) +
+ num*sizeof(struct ctdb_iface);
+ ifaces = talloc_zero_size(outdata, len);
+ CTDB_NO_MEMORY(ctdb, ifaces);
+
+ i = 0;
+ for (cur=ctdb->ifaces;cur;cur=cur->next) {
+ /* bounded copy, then force termination in case the name was truncated */
+ strncpy(ifaces->ifaces[i].name, cur->name,
+ sizeof(ifaces->ifaces[i].name));
+ ifaces->ifaces[i].name[sizeof(ifaces->ifaces[i].name)-1] = '\0';
+ ifaces->ifaces[i].link_state = cur->link_up;
+ ifaces->ifaces[i].references = cur->references;
+ i++;
+ }
+ ifaces->num = i;
+ len = offsetof(struct ctdb_iface_list_old, ifaces) +
+ i*sizeof(struct ctdb_iface);
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)ifaces;
+
+ return 0;
+}
+
+/*
+  Set the link state of a known interface.
+
+  indata carries a struct ctdb_iface: name selects the interface,
+  link_state must be 0 (down) or 1 (up) and references must be 0.
+
+  Returns 0 on success (also when the state is unchanged), -1 if the
+  request is malformed or the interface is not known.
+ */
+int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
+				    struct ctdb_req_control_old *c,
+				    TDB_DATA indata)
+{
+	struct ctdb_iface *req = (struct ctdb_iface *)indata.dptr;
+	struct ctdb_interface *target;
+	bool link_up = false;
+
+	if (req->name[CTDB_IFACE_SIZE] != '\0') {
+		int len = strnlen(req->name, CTDB_IFACE_SIZE);
+		DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
+				  len, len, req->name));
+		return -1;
+	}
+
+	if (req->link_state == 0) {
+		link_up = false;
+	} else if (req->link_state == 1) {
+		link_up = true;
+	} else {
+		DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
+				  (unsigned int)req->link_state));
+		return -1;
+	}
+
+	if (req->references != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
+				  (unsigned int)req->references));
+		return -1;
+	}
+
+	target = ctdb_find_iface(ctdb, req->name);
+	if (target == NULL) {
+		return -1;
+	}
+
+	/* only log and update when the state really changes */
+	if (link_up != target->link_up) {
+		DEBUG(DEBUG_ERR,
+		      ("iface[%s] has changed it's link status %s => %s\n",
+		       target->name,
+		       target->link_up?"up":"down",
+		       link_up?"up":"down"));
+
+		target->link_up = link_up;
+	}
+
+	return 0;
+}
+
+
+/*
+  called by a daemon to inform us of the entire list of TCP tickles for
+  a particular public address.
+  this control should only be sent by the node that is currently serving
+  that public address.
+
+  returns 0 on success (including when the update is ignored because
+  this node hosts the address itself), 1 if the address is not a known
+  public address, and -1 on malformed input or allocation failure.
+ */
+int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
+	struct ctdb_tcp_array *tcparray;
+	struct ctdb_connection *connections;
+	struct ctdb_vnn *vnn;
+
+	/* We must at least have tickles.num or else we can't verify the size
+	   of the received data blob
+	 */
+	if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
+		DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
+		return -1;
+	}
+
+	/* verify that the size of data matches what we expect */
+	if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
+			 + sizeof(struct ctdb_connection) * list->num) {
+		DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
+		return -1;
+	}
+
+	DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
+			   ctdb_addr_to_str(&list->addr)));
+
+	vnn = find_public_ip_vnn(ctdb, &list->addr);
+	if (vnn == NULL) {
+		DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
+			ctdb_addr_to_str(&list->addr)));
+
+		return 1;
+	}
+
+	if (vnn->pnn == ctdb->pnn) {
+		DEBUG(DEBUG_INFO,
+		      ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
+		       ctdb_addr_to_str(&list->addr)));
+		return 0;
+	}
+
+	/* remove any old ticklelist we might have */
+	talloc_free(vnn->tcp_array);
+	vnn->tcp_array = NULL;
+
+	tcparray = talloc(vnn, struct ctdb_tcp_array);
+	CTDB_NO_MEMORY(ctdb, tcparray);
+
+	tcparray->num = list->num;
+
+	connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
+	if (connections == NULL) {
+		/* tcparray is parented to vnn and not yet published; free
+		 * it here so a failed update does not leak it until the
+		 * vnn itself goes away */
+		talloc_free(tcparray);
+	}
+	CTDB_NO_MEMORY(ctdb, connections);
+	tcparray->connections = connections;
+
+	memcpy(tcparray->connections, &list->connections[0],
+	       sizeof(struct ctdb_connection)*tcparray->num);
+
+	/* We now have a new fresh tickle list array for this vnn */
+	vnn->tcp_array = tcparray;
+
+	return 0;
+}
+
+/*
+  called to return the list of tickles for the public address given in
+  indata.  if the supplied address carries a non-zero port, only
+  connections whose destination port matches are returned.
+  returns 1 if the address is not a public address, 0 otherwise.
+ */
+int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
+{
+	ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
+	struct ctdb_tickle_list_old *list;
+	struct ctdb_tcp_array *tcparray;
+	struct ctdb_vnn *vnn;
+	unsigned int wanted = 0;
+	unsigned int filled = 0;
+	unsigned int idx;
+	unsigned port;
+
+	vnn = find_public_ip_vnn(ctdb, addr);
+	if (vnn == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
+			ctdb_addr_to_str(addr)));
+
+		return 1;
+	}
+
+	port = ctdb_addr_to_port(addr);
+	tcparray = vnn->tcp_array;
+
+	/* work out how many connections will be returned */
+	if (tcparray != NULL && port == 0) {
+		/* no port filter: everything */
+		wanted = tcparray->num;
+	} else if (tcparray != NULL) {
+		for (idx = 0; idx < tcparray->num; idx++) {
+			if (ctdb_addr_to_port(&tcparray->connections[idx].dst) == port) {
+				wanted++;
+			}
+		}
+	}
+
+	outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
+		+ sizeof(struct ctdb_connection) * wanted;
+
+	outdata->dptr = talloc_size(outdata, outdata->dsize);
+	CTDB_NO_MEMORY(ctdb, outdata->dptr);
+	list = (struct ctdb_tickle_list_old *)outdata->dptr;
+
+	list->addr = *addr;
+	list->num = wanted;
+
+	/* nothing to copy (this also covers tcparray == NULL) */
+	if (wanted == 0) {
+		return 0;
+	}
+
+	for (idx = 0; idx < tcparray->num; idx++) {
+		if (port != 0 &&
+		    port != ctdb_addr_to_port(&tcparray->connections[idx].dst)) {
+			continue;
+		}
+		list->connections[filled] = tcparray->connections[idx];
+		filled++;
+	}
+
+	return 0;
+}
+
+
+/*
+  broadcast the list of all tcp tickles for a public address to all nodes
+  (CTDB_CONTROL_SET_TCP_TICKLE_LIST, no reply requested).
+  a NULL tcparray is sent as an empty list.
+  returns 0 on success, -1 if the control could not be sent.
+ */
+static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
+					    ctdb_sock_addr *addr,
+					    struct ctdb_tcp_array *tcparray)
+{
+	int ret, num;
+	TDB_DATA data;
+	struct ctdb_tickle_list_old *list;
+
+	if (tcparray) {
+		num = tcparray->num;
+	} else {
+		num = 0;
+	}
+
+	data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
+			sizeof(struct ctdb_connection) * num;
+	data.dptr = talloc_size(ctdb, data.dsize);
+	CTDB_NO_MEMORY(ctdb, data.dptr);
+
+	list = (struct ctdb_tickle_list_old *)data.dptr;
+	list->addr = *addr;
+	list->num = num;
+	if (tcparray) {
+		memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
+	}
+
+	ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
+				       CTDB_CONTROL_SET_TCP_TICKLE_LIST,
+				       0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+
+	/* The buffer hangs off the long-lived ctdb context, so it must be
+	 * released on the failure path too; this function is retried
+	 * periodically and would otherwise leak one buffer per failed
+	 * attempt.  The success path already freed it right after the
+	 * send, so the send call is not expected to keep a reference. */
+	talloc_free(data.dptr);
+
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  send tickle list updates for every public address hosted by this node.
+  unless force is set, only addresses flagged tcp_update_needed are sent.
+  the flag is cleared after a successful send and set after a failed one.
+ */
+static void ctdb_send_set_tcp_tickles_for_all(struct ctdb_context *ctdb,
+					      bool force)
+{
+	struct ctdb_vnn *vnn;
+
+	for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+		int ret;
+
+		/* addresses hosted elsewhere are not ours to announce */
+		if (vnn->pnn != ctdb->pnn) {
+			continue;
+		}
+
+		/* nothing pending for this address and not forced */
+		if (!(force || vnn->tcp_update_needed)) {
+			continue;
+		}
+
+		ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
+						       &vnn->public_address,
+						       vnn->tcp_array);
+		if (ret == 0) {
+			D_INFO("Sent tickle update for ip %s\n",
+			       ctdb_addr_to_str(&vnn->public_address));
+			vnn->tcp_update_needed = false;
+		} else {
+			D_ERR("Failed to send the tickle update for ip %s\n",
+			      ctdb_addr_to_str(&vnn->public_address));
+			vnn->tcp_update_needed = true;
+		}
+	}
+}
+
+/*
+  timer handler: send any pending tickle updates, then schedule the
+  next run tickle_update_interval seconds from now
+ */
+static void ctdb_update_tcp_tickles(struct tevent_context *ev,
+				    struct tevent_timer *te,
+				    struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb;
+	struct timeval next_run;
+
+	ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+	ctdb_send_set_tcp_tickles_for_all(ctdb, false);
+
+	/* computed after the send so the interval starts from here */
+	next_run = timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0);
+	tevent_add_timer(ctdb->ev, ctdb->tickle_update_context, next_run,
+			 ctdb_update_tcp_tickles, ctdb);
+}
+
+/*
+  start periodic update of tcp tickles: create the talloc context that
+  owns the timer and schedule the first run
+ */
+void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
+{
+	struct timeval first_run;
+
+	ctdb->tickle_update_context = talloc_new(ctdb);
+
+	first_run = timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0);
+	tevent_add_timer(ctdb->ev, ctdb->tickle_update_context, first_run,
+			 ctdb_update_tcp_tickles, ctdb);
+}
+
+
+
+
+struct control_gratious_arp { /* state for one series of repeated gratuitous ARPs */
+ struct ctdb_context *ctdb; /* daemon context; its event loop is used to re-arm the timer */
+ ctdb_sock_addr addr; /* address passed to ctdb_sys_send_arp() */
+ const char *iface; /* interface name passed to ctdb_sys_send_arp() */
+ int count; /* sends done so far; the series ends at CTDB_ARP_REPEAT */
+};
+
+/*
+  timer handler: send one gratuitous arp and re-arm the timer until
+  CTDB_ARP_REPEAT sends have been made, then free the state.
+  a failed send is logged but still counts towards the total.
+ */
+static void send_gratious_arp(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval t, void *private_data)
+{
+	struct control_gratious_arp *arp = talloc_get_type(private_data,
+						struct control_gratious_arp);
+	int ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
+
+	if (ret != 0) {
+		DBG_ERR("Failed to send gratuitous ARP on iface %s: %s\n",
+			arp->iface, strerror(ret));
+	}
+
+	arp->count += 1;
+	if (arp->count != CTDB_ARP_REPEAT) {
+		/* more to send: the timer is owned by arp itself */
+		tevent_add_timer(arp->ctdb->ev, arp,
+				 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
+				 send_gratious_arp, arp);
+		return;
+	}
+
+	talloc_free(arp);
+}
+
+
+/*
+  send a series of gratuitous arps for the address/interface in indata.
+  indata must be a ctdb_addr_info_old whose total size is exactly the
+  fixed header plus len bytes of interface name.
+  returns 0 once the first send has been scheduled, -1 on malformed
+  input or allocation failure.
+ */
+int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
+	struct control_gratious_arp *arp;
+	const char *iface;
+
+	/* verify the size of indata */
+	if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
+		DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
+				 (unsigned)indata.dsize,
+				 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
+		return -1;
+	}
+	if (indata.dsize !=
+		( offsetof(struct ctdb_addr_info_old, iface)
+		+ gratious_arp->len ) ){
+
+		DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+			"but should be %u bytes\n",
+			 (unsigned)indata.dsize,
+			 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
+		return -1;
+	}
+
+
+	arp = talloc(ctdb, struct control_gratious_arp);
+	CTDB_NO_MEMORY(ctdb, arp);
+
+	arp->ctdb = ctdb;
+	arp->addr = gratious_arp->addr;
+
+	/* The size check above only proves that len bytes of name are
+	 * present, not that they are NUL-terminated, so bound the copy by
+	 * len instead of trusting a terminator in the blob. */
+	iface = talloc_strndup(arp, gratious_arp->iface, gratious_arp->len);
+	if (iface == NULL) {
+		/* arp is parented to ctdb; don't leak it on failure */
+		talloc_free(arp);
+	}
+	CTDB_NO_MEMORY(ctdb, iface);
+	arp->iface = iface;
+	arp->count = 0;
+
+	/* first send happens as soon as the event loop runs */
+	tevent_add_timer(arp->ctdb->ev, arp,
+			 timeval_zero(), send_gratious_arp, arp);
+
+	return 0;
+}
+
+/*
+  add a public address described by a ctdb_addr_info_old in indata.
+  the blob must be exactly the fixed header plus len bytes of interface
+  name.  returns 0 on success, -1 on malformed input or if the address
+  could not be added.
+ */
+int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
+	const size_t hdr_len = offsetof(struct ctdb_addr_info_old, iface);
+
+	/* verify the size of indata */
+	if (indata.dsize < hdr_len) {
+		DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
+		return -1;
+	}
+	if (indata.dsize != hdr_len + pub->len) {
+		DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+			"but should be %u bytes\n",
+			 (unsigned)indata.dsize,
+			 (unsigned)(hdr_len + pub->len)));
+		return -1;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
+	if (ctdb_add_public_address(ctdb, &pub->addr, pub->mask,
+				    &pub->iface[0], true) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  delete the public address described by a ctdb_addr_info_old in indata.
+  an address hosted by this node is only marked delete_pending; any
+  other known address is deleted immediately.
+  returns 0 if the address was found, -1 on malformed input or if the
+  address is unknown.
+ */
+int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
+	const size_t hdr_len = offsetof(struct ctdb_addr_info_old, iface);
+	struct ctdb_vnn *vnn;
+
+	/* verify the size of indata */
+	if (indata.dsize < hdr_len) {
+		DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
+		return -1;
+	}
+	if (indata.dsize != hdr_len + pub->len) {
+		DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+			"but should be %u bytes\n",
+			 (unsigned)indata.dsize,
+			 (unsigned)(hdr_len + pub->len)));
+		return -1;
+	}
+
+	DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
+	/* walk over all public addresses until we find a match */
+	for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+		if (!ctdb_same_ip(&vnn->public_address, &pub->addr)) {
+			continue;
+		}
+
+		if (vnn->pnn != ctdb->pnn) {
+			/* Not hosted on the current node, so it can be
+			 * deleted straight away. */
+			do_delete_ip(ctdb, vnn);
+		} else {
+			/* Currently hosted here.  Defer the deletion
+			 * until the next takeover run.  "ctdb reloadips"
+			 * will always cause a takeover run.  "ctdb delip"
+			 * will now need an explicit "ctdb ipreallocated"
+			 * afterwards. */
+			vnn->delete_pending = true;
+		}
+
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
+			 ctdb_addr_to_str(&pub->addr)));
+	return -1;
+}
+
+
+struct ipreallocated_callback_state { /* carried from ctdb_control_ipreallocated() to its callback */
+ struct ctdb_req_control_old *c; /* the control request to reply to once the event script has finished */
+};
+
+/*
+  completion callback for the "ipreallocated" event script.
+  logs a failure, bans this node if the script timed out, then sends the
+  deferred control reply carrying the script status and frees the state.
+ */
+static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
+					int status, void *p)
+{
+	struct ipreallocated_callback_state *state =
+		talloc_get_type(p, struct ipreallocated_callback_state);
+	bool timed_out = (status == -ETIMEDOUT);
+
+	if (status != 0) {
+		DEBUG(DEBUG_ERR,
+		      (" \"ipreallocated\" event script failed (status %d)\n",
+		       status));
+	}
+	if (timed_out) {
+		ctdb_ban_self(ctdb);
+	}
+
+	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+	talloc_free(state);
+}
+
+/*
+ * A control to run the ipreallocated event.
+ *
+ * Schedules the "ipreallocated" event script with
+ * ctdb_ipreallocated_callback as its completion callback.  On success
+ * the request c is moved onto the callback state, *async_reply is set
+ * to true and 0 is returned; the reply is then sent from the callback.
+ * Returns -1 if the state cannot be allocated or the script cannot be
+ * started, in which case *async_reply is left untouched.
+ */
+int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
+ struct ctdb_req_control_old *c,
+ bool *async_reply)
+{
+ int ret;
+ struct ipreallocated_callback_state *state;
+
+ state = talloc(ctdb, struct ipreallocated_callback_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
+
+ ret = ctdb_event_script_callback(ctdb, state,
+ ctdb_ipreallocated_callback, state,
+ CTDB_EVENT_IPREALLOCATED,
+ "%s", "");
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
+ talloc_free(state);
+ return -1;
+ }
+
+ /* tell the control that we will reply asynchronously.
+ * NOTE(review): state->c is only assigned here, after the script has
+ * been scheduled; this presumably relies on the callback never
+ * running before this function returns - confirm. */
+ state->c = talloc_steal(state, c);
+ *async_reply = true;
+
+ return 0;
+}
+
+
+struct ctdb_reloadips_handle { /* tracks one in-flight "reloadips" child process */
+ struct ctdb_context *ctdb; /* daemon context */
+ struct ctdb_req_control_old *c; /* pending control request; replied to from the destructor */
+ int status; /* status sent in the reply; starts at -1, set from the child's result byte */
+ int fd[2]; /* pipe: child writes one result byte to fd[1], parent reads fd[0] */
+ pid_t child; /* pid of the forked child; SIGKILLed by the destructor */
+ struct tevent_fd *fde; /* read event on fd[0] */
+};
+/*
+ * talloc destructor for a reloadips handle.
+ *
+ * Clears ctdb->reload_ips if it still points at this handle, sends the
+ * pending control reply (if any) with h->status, and sends SIGKILL to
+ * the child.  Always returns 0.
+ */
+static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
+{
+ if (h == h->ctdb->reload_ips) {
+ h->ctdb->reload_ips = NULL;
+ }
+ if (h->c != NULL) {
+ ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
+ h->c = NULL; /* make sure the reply is sent only once */
+ }
+ ctdb_kill(h->ctdb, h->child, SIGKILL);
+ return 0;
+}
+/*
+ * Timer handler armed by ctdb_control_reload_public_ips(): gives up on
+ * the reloadips child by freeing the handle.  h->status is not touched
+ * here, so the destructor replies with whatever status is stored.
+ */
+static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
+
+ talloc_free(h);
+}
+/*
+ * fd handler for the read end of the reloadips pipe.
+ *
+ * Reads the single result byte written by the child.  A short read or a
+ * non-zero byte is logged and recorded as status 1; otherwise status is
+ * 0.  The handle is then freed, which sends the control reply from the
+ * destructor.
+ */
+static void ctdb_reloadips_child_handler(struct tevent_context *ev,
+ struct tevent_fd *fde,
+ uint16_t flags, void *private_data)
+{
+ struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
+
+ char res;
+ int ret;
+
+ ret = sys_read(h->fd[0], &res, 1);
+ if (ret < 1 || res != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
+ res = 1; /* normalise every failure to status 1 */
+ }
+ h->status = res;
+
+ talloc_free(h);
+}
+
+/*
+  body of the reloadips child process.
+
+  fetches the public IPs currently known to the local daemon, re-reads
+  the public addresses file into ctdb->vnn, and then sends
+  DEL_PUBLIC_IP for every address that is no longer in the file and
+  ADD_PUBLIC_IP for every address that is new in the file.  before the
+  first addition a REBALANCE_NODE message is broadcast.
+
+  returns 0 if all controls completed successfully, -1 otherwise.
+ */
+static int ctdb_reloadips_child(struct ctdb_context *ctdb)
+{
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	struct ctdb_public_ip_list_old *ips;
+	struct ctdb_vnn *vnn;
+	struct client_async_data *async_data;
+	struct timeval timeout;
+	TDB_DATA data;
+	struct ctdb_client_control_state *state;
+	bool first_add;
+	unsigned int i;
+	int ret;
+
+	CTDB_NO_MEMORY(ctdb, mem_ctx);
+
+	/* Read IPs from local node */
+	ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
+				       CTDB_CURRENT_NODE, mem_ctx, &ips);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Unable to fetch public IPs from local node\n"));
+		talloc_free(mem_ctx);
+		return -1;
+	}
+
+	/* Read IPs file - this is safe since this is a child process */
+	ctdb->vnn = NULL;
+	if (ctdb_set_public_addresses(ctdb, false) != 0) {
+		DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
+		talloc_free(mem_ctx);
+		return -1;
+	}
+
+	async_data = talloc_zero(mem_ctx, struct client_async_data);
+	CTDB_NO_MEMORY(ctdb, async_data);
+
+	/* Compare IPs between node and file for IPs to be deleted */
+	for (i = 0; i < ips->num; i++) {
+		/* look for ips->ips[i] in the freshly read file */
+		for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
+			if (ctdb_same_ip(&vnn->public_address,
+					 &ips->ips[i].addr)) {
+				/* IP is still in file */
+				break;
+			}
+		}
+
+		if (vnn == NULL) {
+			/* Delete IP ips->ips[i] */
+			struct ctdb_addr_info_old *pub;
+
+			DEBUG(DEBUG_NOTICE,
+			      ("IP %s no longer configured, deleting it\n",
+			       ctdb_addr_to_str(&ips->ips[i].addr)));
+
+			pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
+			CTDB_NO_MEMORY(ctdb, pub);
+
+			pub->addr  = ips->ips[i].addr;
+			pub->mask  = 0;
+			pub->len   = 0;
+
+			timeout = TAKEOVER_TIMEOUT();
+
+			data.dsize = offsetof(struct ctdb_addr_info_old,
+					      iface) + pub->len;
+			data.dptr = (uint8_t *)pub;
+
+			state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+						  CTDB_CONTROL_DEL_PUBLIC_IP,
+						  0, data, async_data,
+						  &timeout, NULL);
+			if (state == NULL) {
+				DEBUG(DEBUG_ERR,
+				      (__location__
+				       " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
+				goto failed;
+			}
+
+			ctdb_client_async_add(async_data, state);
+		}
+	}
+
+	/* Compare IPs between node and file for IPs to be added */
+	first_add = true;
+	for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
+		for (i = 0; i < ips->num; i++) {
+			if (ctdb_same_ip(&vnn->public_address,
+					 &ips->ips[i].addr)) {
+				/* IP already on node */
+				break;
+			}
+		}
+		if (i == ips->num) {
+			/* Add IP vnn->public_address */
+			struct ctdb_addr_info_old *pub;
+			const char *ifaces = NULL;
+			uint32_t len;
+			struct vnn_interface *iface = NULL;
+
+			DEBUG(DEBUG_NOTICE,
+			      ("New IP %s configured, adding it\n",
+			       ctdb_addr_to_str(&vnn->public_address)));
+			if (first_add) {
+				uint32_t pnn = ctdb_get_pnn(ctdb);
+
+				data.dsize = sizeof(pnn);
+				data.dptr  = (uint8_t *)&pnn;
+
+				ret = ctdb_client_send_message(
+					ctdb,
+					CTDB_BROADCAST_CONNECTED,
+					CTDB_SRVID_REBALANCE_NODE,
+					data);
+				if (ret != 0) {
+					DEBUG(DEBUG_WARNING,
+					      ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
+				}
+
+				first_add = false;
+			}
+
+			/* build a comma-separated list of interface names */
+			ifaces = vnn->ifaces->iface->name;
+			iface = vnn->ifaces->next;
+			while (iface != NULL) {
+				ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
+							 iface->iface->name);
+				if (ifaces == NULL) {
+					/* without this check the strlen()
+					 * below would dereference NULL */
+					DEBUG(DEBUG_ERR,
+					      (__location__
+					       " out of memory building interface list\n"));
+					goto failed;
+				}
+				iface = iface->next;
+			}
+
+			/* len includes the terminating NUL */
+			len   = strlen(ifaces) + 1;
+			pub = talloc_zero_size(mem_ctx,
+					       offsetof(struct ctdb_addr_info_old, iface) + len);
+			CTDB_NO_MEMORY(ctdb, pub);
+
+			pub->addr  = vnn->public_address;
+			pub->mask  = vnn->public_netmask_bits;
+			pub->len   = len;
+			memcpy(&pub->iface[0], ifaces, pub->len);
+
+			timeout = TAKEOVER_TIMEOUT();
+
+			data.dsize = offsetof(struct ctdb_addr_info_old,
+					      iface) + pub->len;
+			data.dptr = (uint8_t *)pub;
+
+			state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+						  CTDB_CONTROL_ADD_PUBLIC_IP,
+						  0, data, async_data,
+						  &timeout, NULL);
+			if (state == NULL) {
+				DEBUG(DEBUG_ERR,
+				      (__location__
+				       " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
+				goto failed;
+			}
+
+			ctdb_client_async_add(async_data, state);
+		}
+	}
+
+	/* wait for all queued add/delete controls to finish */
+	if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
+		goto failed;
+	}
+
+	talloc_free(mem_ctx);
+	return 0;
+
+failed:
+	talloc_free(mem_ctx);
+	return -1;
+}
+
+/* This control is sent to force the node to re-read the public addresses file
+   and drop any addresses we should no longer host, and add new addresses
+   that we are now able to host.
+
+   The work is done in a forked child (ctdb_reloadips_child) which reports
+   a single result byte over a pipe.  The control reply is deferred: it is
+   sent from the handle's destructor, either when the child reports back
+   or when the 120 second timer fires.
+
+   Returns 0 with *async_reply set to true once the child is running, or
+   -1 if the handle, pipe or child could not be created.
+*/
+int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
+{
+	struct ctdb_reloadips_handle *h;
+	pid_t parent = getpid();
+
+	/* NOTE(review): nothing in this function or in the reloadips
+	 * callbacks assigns ctdb->reload_ips, so this cancellation of an
+	 * earlier in-flight reload looks like it can never trigger -
+	 * confirm whether "ctdb->reload_ips = h" was intended below. */
+	if (ctdb->reload_ips != NULL) {
+		talloc_free(ctdb->reload_ips);
+		ctdb->reload_ips = NULL;
+	}
+
+	h = talloc(ctdb, struct ctdb_reloadips_handle);
+	CTDB_NO_MEMORY(ctdb, h);
+	h->ctdb     = ctdb;
+	h->c        = NULL;
+	h->status   = -1;
+
+	if (pipe(h->fd) == -1) {
+		/* the original message named ctdb_freeze_lock, which is
+		 * unrelated to this code path */
+		DEBUG(DEBUG_ERR,("Failed to create pipe for reloadips\n"));
+		talloc_free(h);
+		return -1;
+	}
+
+	h->child = ctdb_fork(ctdb);
+	if (h->child == (pid_t)-1) {
+		DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
+		close(h->fd[0]);
+		close(h->fd[1]);
+		talloc_free(h);
+		return -1;
+	}
+
+	/* child process */
+	if (h->child == 0) {
+		signed char res = 0;
+
+		close(h->fd[0]);
+
+		prctl_set_comment("ctdb_reloadips");
+		if (switch_from_server_to_client(ctdb) != 0) {
+			DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
+			res = -1;
+		} else {
+			res = ctdb_reloadips_child(ctdb);
+			if (res != 0) {
+				DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
+			}
+		}
+
+		/* report the result, then linger until the parent exits */
+		sys_write(h->fd[1], &res, 1);
+		ctdb_wait_for_process_to_exit(parent);
+		_exit(0);
+	}
+
+	/* parent: from here on the handle owns the request and the
+	 * destructor is responsible for replying to it */
+	h->c = talloc_steal(h, c);
+
+	close(h->fd[1]);
+	set_close_on_exec(h->fd[0]);
+
+	talloc_set_destructor(h, ctdb_reloadips_destructor);
+
+
+	h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
+			       ctdb_reloadips_child_handler, (void *)h);
+	tevent_fd_set_auto_close(h->fde);
+
+	/* give up on the child after 120 seconds */
+	tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
+			 ctdb_reloadips_timeout_event, h);
+
+	/* we reply later */
+	*async_reply = true;
+	return 0;
+}
diff --git a/ctdb/server/ctdb_takeover_helper.c b/ctdb/server/ctdb_takeover_helper.c
new file mode 100644
index 0000000..c088970
--- /dev/null
+++ b/ctdb/server/ctdb_takeover_helper.c
@@ -0,0 +1,1276 @@
+/*
+ CTDB IP takeover helper
+
+ Copyright (C) Martin Schwenke 2016
+
+ Based on ctdb_recovery_helper.c
+ Copyright (C) Amitay Isaacs 2015
+
+ and ctdb_takeover.c
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+
+#include <popt.h>
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/strv.h"
+#include "lib/util/strv_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/time.h"
+#include "lib/util/tevent_unix.h"
+
+#include "protocol/protocol.h"
+#include "protocol/protocol_api.h"
+#include "protocol/protocol_util.h"
+#include "client/client.h"
+
+#include "common/logging.h"
+
+#include "server/ipalloc.h"
+
+static int takeover_timeout = 9; /* offset (first argument of timeval_current_ofs) used by TIMEOUT(); starts at 9 */
+ /* TIMEOUT() expands to timeval_current_ofs(takeover_timeout, 0), i.e. it is re-evaluated on every use */
+#define TIMEOUT() timeval_current_ofs(takeover_timeout, 0)
+
+/*
+ * Utility functions
+ */
+
+/*
+ * Shared receive helper for requests that return no data: reports
+ * whether req completed without a unix error, storing the error in
+ * *perr (when perr is not NULL) on failure.
+ */
+static bool generic_recv(struct tevent_req *req, int *perr)
+{
+	int err;
+	bool failed = tevent_req_is_unix_error(req, &err);
+
+	if (failed && perr != NULL) {
+		*perr = err;
+	}
+
+	return !failed;
+}
+
+/*
+ * Map the ip_alloc_algorithm tunable to an allocation algorithm:
+ * 0 is deterministic, 1 is non-deterministic, and 2 or any other value
+ * selects LCP2.
+ */
+static enum ipalloc_algorithm
+determine_algorithm(const struct ctdb_tunable_list *tunables)
+{
+	if (tunables->ip_alloc_algorithm == 0) {
+		return IPALLOC_DETERMINISTIC;
+	}
+	if (tunables->ip_alloc_algorithm == 1) {
+		return IPALLOC_NONDETERMINISTIC;
+	}
+
+	/* 2 and every unrecognised value */
+	return IPALLOC_LCP2;
+}
+
+/**********************************************************************/
+
+struct get_public_ips_state { /* state of one get_public_ips request */
+ uint32_t *pnns; /* PNNs being queried; the caller's array, not copied */
+ int count; /* number of entries in pnns */
+ struct ctdb_public_ip_list *ips; /* results: num_nodes entries, filled in by PNN */
+ uint32_t *ban_credits; /* caller's array indexed by PNN; incremented for failing nodes */
+};
+
+static void get_public_ips_done(struct tevent_req *subreq);
+
+/*
+ * Start fetching the public IP lists of a set of nodes.
+ *
+ * pnns/count name the nodes to query; num_nodes sizes the result array,
+ * which is filled in by PNN.  ban_credits is the caller's per-PNN
+ * counter array, incremented for nodes that fail.  available_only is
+ * passed through to the GET_PUBLIC_IPS control.
+ *
+ * Returns the new request, or NULL if it could not be created.  With
+ * count == 0 the request completes immediately with an all-zero result
+ * array.
+ */
+static struct tevent_req *get_public_ips_send(
+				TALLOC_CTX *mem_ctx,
+				struct tevent_context *ev,
+				struct ctdb_client_context *client,
+				uint32_t *pnns,
+				int count, int num_nodes,
+				uint32_t *ban_credits,
+				bool available_only)
+{
+	struct tevent_req *req, *subreq;
+	struct get_public_ips_state *state;
+	struct ctdb_req_control request;
+
+	req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->pnns = pnns;
+	state->count = count;
+	state->ban_credits = ban_credits;
+
+	state->ips  = talloc_zero_array(state,
+					struct ctdb_public_ip_list,
+					num_nodes);
+	if (tevent_req_nomem(state->ips, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	/* Short circuit if no nodes being asked for IPs */
+	if (state->count == 0) {
+		tevent_req_done(req);
+		return tevent_req_post(req, ev);
+	}
+
+	ctdb_req_control_get_public_ips(&request, available_only);
+	/* Parent the subrequest to state rather than to the caller's
+	 * mem_ctx, as release_ip_send() and take_ip_send() do: if req is
+	 * freed early the subrequest goes with it instead of later calling
+	 * back into a freed request. */
+	subreq = ctdb_client_control_multi_send(state, ev, client,
+						state->pnns,
+						state->count,
+						TIMEOUT(), &request);
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+	tevent_req_set_callback(subreq, get_public_ips_done, req);
+
+	return req;
+}
+/*
+ * Completion callback for the GET_PUBLIC_IPS multi-node control.
+ *
+ * If the multi-control failed, every node with a non-zero entry in
+ * err_list is logged and given a ban credit, and the request fails with
+ * the error returned by the multi-control.  Otherwise each reply is
+ * parsed and stored in state->ips[], indexed by PNN; a node whose reply
+ * cannot be parsed gets a ban credit, and the request fails with EIO
+ * once all replies have been looked at.
+ */
+static void get_public_ips_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct get_public_ips_state *state = tevent_req_data(
+ req, struct get_public_ips_state);
+ struct ctdb_reply_control **reply;
+ int *err_list;
+ int ret, i;
+ bool status, found_errors;
+
+ status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
+ &reply);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ for (i = 0; i < state->count; i++) {
+ if (err_list[i] != 0) {
+ uint32_t pnn = state->pnns[i]; /* err_list[i] belongs to pnns[i] */
+
+ D_ERR("control GET_PUBLIC_IPS failed on "
+ "node %u, ret=%d\n", pnn, err_list[i]);
+
+ state->ban_credits[pnn]++;
+ }
+ }
+
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ found_errors = false;
+ for (i = 0; i < state->count; i++) {
+ uint32_t pnn;
+ struct ctdb_public_ip_list *ips;
+
+ pnn = state->pnns[i];
+ ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
+ &ips);
+ if (ret != 0) {
+ D_ERR("control GET_PUBLIC_IPS failed on "
+ "node %u\n", pnn);
+ state->ban_credits[pnn]++;
+ found_errors = true;
+ continue; /* keep going so every bad node is reported */
+ }
+
+ D_INFO("Fetched public IPs from node %u\n", pnn);
+ state->ips[pnn] = *ips; /* results are stored by PNN, not by reply position */
+ }
+
+ if (found_errors) {
+ tevent_req_error(req, EIO);
+ return;
+ }
+
+ talloc_free(reply);
+
+ tevent_req_done(req);
+}
+
+/*
+ * Collect the result of a get_public_ips request.
+ *
+ * On success the result array is moved onto mem_ctx, returned through
+ * *ips, and true is returned.  On failure false is returned and the
+ * error is stored in *perr when perr is not NULL.
+ */
+static bool get_public_ips_recv(struct tevent_req *req, int *perr,
+				TALLOC_CTX *mem_ctx,
+				struct ctdb_public_ip_list **ips)
+{
+	struct get_public_ips_state *state = tevent_req_data(
+		req, struct get_public_ips_state);
+	int err;
+
+	if (!tevent_req_is_unix_error(req, &err)) {
+		*ips = talloc_steal(mem_ctx, state->ips);
+		return true;
+	}
+
+	if (perr != NULL) {
+		*perr = err;
+	}
+	return false;
+}
+
+/**********************************************************************/
+
+struct release_ip_state { /* overall state of one release_ip request */
+ int num_sent; /* number of per-IP multi-controls sent */
+ int num_replies; /* number of those that have completed */
+ int num_fails; /* number of those that completed with a failure */
+ int err_any; /* an error from a failed control; reported when num_fails > 0 */
+ uint32_t *ban_credits; /* caller's array indexed by PNN; incremented for failing nodes */
+};
+
+struct release_ip_one_state { /* state of the RELEASE_IP multi-control for a single IP */
+ struct tevent_req *req; /* the parent release_ip request */
+ uint32_t *pnns; /* nodes this IP's RELEASE_IP is sent to */
+ int count; /* number of valid entries in pnns */
+ const char *ip_str; /* printable form of the IP, used in log messages */
+};
+
+static void release_ip_done(struct tevent_req *subreq);
+/*
+ * Start sending RELEASE_IP controls for every IP in all_ips.
+ *
+ * For each IP, one multi-node control is sent to those nodes in
+ * pnns[0..count) that know the IP (per its known_on bitmap) and are not
+ * the node the IP is assigned to.  IPs with no such node are skipped.
+ * ban_credits is the caller's per-PNN counter array, incremented by the
+ * completion callback for failing nodes.
+ *
+ * Returns the new request, or NULL if it could not be created.  If
+ * nothing needs sending the request completes immediately.
+ */
+static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ struct ctdb_client_context *client,
+ uint32_t *pnns,
+ int count,
+ struct timeval timeout,
+ struct public_ip_list *all_ips,
+ uint32_t *ban_credits)
+{
+ struct tevent_req *req, *subreq;
+ struct release_ip_state *state;
+ struct ctdb_req_control request;
+ struct public_ip_list *tmp_ip;
+
+ req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->num_sent = 0;
+ state->num_replies = 0;
+ state->num_fails = 0;
+ state->ban_credits = ban_credits;
+
+ /* Send a RELEASE_IP to all nodes that should not be hosting
+ * each IP. For each IP, all but one of these will be
+ * redundant. However, the redundant ones are used to tell
+ * nodes which node should be hosting the IP so that commands
+ * like "ctdb ip" can display a particular node's idea of who
+ * is hosting what. */
+ for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
+ struct release_ip_one_state *substate;
+ struct ctdb_public_ip ip;
+ int i;
+
+ substate = talloc_zero(state, struct release_ip_one_state);
+ if (tevent_req_nomem(substate, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ substate->pnns = talloc_zero_array(substate, uint32_t, count); /* sized for the worst case: every node */
+ if (tevent_req_nomem(substate->pnns, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ substate->count = 0;
+ substate->req = req;
+
+ substate->ip_str = ctdb_sock_addr_to_string(substate,
+ &tmp_ip->addr,
+ false);
+ if (tevent_req_nomem(substate->ip_str, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ for (i = 0; i < count; i++) {
+ uint32_t pnn = pnns[i];
+
+ /* Skip this node if IP is not known */
+ if (! bitmap_query(tmp_ip->known_on, pnn)) {
+ continue;
+ }
+
+ /* If pnn is not the node that should be
+ * hosting the IP then add it to the list of
+ * nodes that need to do a release. */
+ if (tmp_ip->pnn != pnn) {
+ substate->pnns[substate->count] = pnn;
+ substate->count++;
+ }
+ }
+
+ if (substate->count == 0) {
+ /* No releases to send for this address... */
+ TALLOC_FREE(substate);
+ continue;
+ }
+
+ ip.pnn = tmp_ip->pnn;
+ ip.addr = tmp_ip->addr;
+ ctdb_req_control_release_ip(&request, &ip);
+ subreq = ctdb_client_control_multi_send(state, ev, client,
+ substate->pnns,
+ substate->count,
+ timeout,/* cumulative */
+ &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, release_ip_done, substate);
+
+ state->num_sent++; /* release_ip_done() compares num_replies against this */
+ }
+
+ /* None sent, finished... */
+ if (state->num_sent == 0) {
+ tevent_req_done(req);
+ return tevent_req_post(req, ev);
+ }
+
+ return req;
+}
+/*
+ * Completion callback for the RELEASE_IP multi-control of one IP.
+ *
+ * On failure, each node with a non-zero entry in err_list is logged and
+ * given a ban credit; if no node-specific error is found the failure is
+ * recorded as EIO.  The per-IP state is then freed.  The parent request
+ * only completes once a callback has run for every control that was
+ * sent: with err_any if any of them failed, successfully otherwise.
+ */
+static void release_ip_done(struct tevent_req *subreq)
+{
+ struct release_ip_one_state *substate = tevent_req_callback_data(
+ subreq, struct release_ip_one_state);
+ struct tevent_req *req = substate->req;
+ struct release_ip_state *state = tevent_req_data(
+ req, struct release_ip_state);
+ int ret, i;
+ int *err_list;
+ bool status, found_errors;
+
+ status = ctdb_client_control_multi_recv(subreq, &ret, state,
+ &err_list, NULL);
+ TALLOC_FREE(subreq);
+
+ if (status) {
+ D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
+ substate->ip_str, substate->count);
+ goto done;
+ }
+
+ /* Get some clear error messages out of err_list and count
+ * banning credits
+ */
+ found_errors = false;
+ for (i = 0; i < substate->count; i++) {
+ int err = err_list[i];
+ if (err != 0) {
+ uint32_t pnn = substate->pnns[i]; /* err_list[i] belongs to pnns[i] */
+
+ D_ERR("RELEASE_IP %s failed on node %u, "
+ "ret=%d\n", substate->ip_str, pnn, err);
+
+ state->ban_credits[pnn]++;
+ state->err_any = err;
+ found_errors = true;
+ }
+ }
+ if (! found_errors) {
+ D_ERR("RELEASE_IP %s internal error, ret=%d\n",
+ substate->ip_str, ret);
+ state->err_any = EIO;
+ }
+
+ state->num_fails++;
+
+done:
+ talloc_free(substate);
+
+ state->num_replies++;
+
+ if (state->num_replies < state->num_sent) {
+ /* Not all replies received, don't go further */
+ return;
+ }
+
+ if (state->num_fails > 0) {
+ tevent_req_error(req, state->err_any);
+ return;
+ }
+
+ tevent_req_done(req);
+}
+/* Collect the result of a release_ip request; see generic_recv(). */
+static bool release_ip_recv(struct tevent_req *req, int *perr)
+{
+ return generic_recv(req, perr);
+}
+
+/**********************************************************************/
+
+struct take_ip_state { /* overall state of one take_ip request */
+ int num_sent; /* number of TAKEOVER_IP controls sent */
+ int num_replies; /* number of those that have completed */
+ int num_fails; /* number of those that failed */
+ int err_any; /* error from the most recent failure; reported when num_fails > 0 */
+ uint32_t *ban_credits; /* caller's array indexed by PNN; incremented for failing nodes */
+};
+
+struct take_ip_one_state { /* state of the TAKEOVER_IP control for a single IP */
+ struct tevent_req *req; /* the parent take_ip request */
+ uint32_t pnn; /* node the control was sent to */
+ const char *ip_str; /* printable form of the IP, used in log messages */
+};
+
+static void take_ip_done(struct tevent_req *subreq);
+/*
+ * Start sending TAKEOVER_IP controls for every assigned IP in all_ips.
+ *
+ * One control is sent per IP, to the node the IP is assigned to; IPs
+ * whose pnn is CTDB_UNKNOWN_PNN are skipped.  ban_credits is the
+ * caller's per-PNN counter array, incremented by the completion
+ * callback for failing nodes.
+ *
+ * Returns the new request, or NULL if it could not be created.  If
+ * nothing needs sending the request completes immediately.
+ */
+static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
+ struct tevent_context *ev,
+ struct ctdb_client_context *client,
+ struct timeval timeout,
+ struct public_ip_list *all_ips,
+ uint32_t *ban_credits)
+{
+ struct tevent_req *req, *subreq;
+ struct take_ip_state *state;
+ struct ctdb_req_control request;
+ struct public_ip_list *tmp_ip;
+
+ req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ state->num_sent = 0;
+ state->num_replies = 0;
+ state->num_fails = 0;
+ state->ban_credits = ban_credits;
+
+ /* For each IP, send a TAKEOVER_IP to the node that should be
+ * hosting it. Many of these will often be redundant (since
+ * the allocation won't have changed) but they can be useful
+ * to recover from inconsistencies. */
+ for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
+ struct take_ip_one_state *substate;
+ struct ctdb_public_ip ip;
+
+ if (tmp_ip->pnn == CTDB_UNKNOWN_PNN) {
+ /* IP will be unassigned */
+ continue;
+ }
+
+ substate = talloc_zero(state, struct take_ip_one_state);
+ if (tevent_req_nomem(substate, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ substate->req = req;
+ substate->pnn = tmp_ip->pnn;
+
+ substate->ip_str = ctdb_sock_addr_to_string(substate,
+ &tmp_ip->addr,
+ false);
+ if (tevent_req_nomem(substate->ip_str, req)) {
+ return tevent_req_post(req, ev);
+ }
+
+ ip.pnn = tmp_ip->pnn;
+ ip.addr = tmp_ip->addr;
+ ctdb_req_control_takeover_ip(&request, &ip);
+ subreq = ctdb_client_control_send(
+ state, ev, client, tmp_ip->pnn,
+ timeout, /* cumulative */
+ &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return tevent_req_post(req, ev);
+ }
+ tevent_req_set_callback(subreq, take_ip_done, substate);
+
+ state->num_sent++; /* take_ip_done() compares num_replies against this */
+ }
+
+ /* None sent, finished... */
+ if (state->num_sent == 0) {
+ tevent_req_done(req);
+ return tevent_req_post(req, ev);
+ }
+
+ return req;
+}
+
+/*
+ * Completion handler for a single TAKEOVER_IP control.
+ *
+ * A failure (either no reply could be received or the reply carries a
+ * non-zero result) is logged, charged to the node's ban credits and
+ * remembered in the shared state. The enclosing request is finished
+ * only when the last outstanding control has been accounted for: with
+ * the last recorded error if any control failed, successfully otherwise.
+ */
+static void take_ip_done(struct tevent_req *subreq)
+{
+	struct take_ip_one_state *one = tevent_req_callback_data(
+		subreq, struct take_ip_one_state);
+	struct tevent_req *req = one->req;
+	struct take_ip_state *state = tevent_req_data(
+		req, struct take_ip_state);
+	struct ctdb_reply_control *reply;
+	int err = 0;
+	bool received;
+	bool failed = true;
+
+	received = ctdb_client_control_recv(subreq, &err, state, &reply);
+	TALLOC_FREE(subreq);
+
+	if (! received) {
+		D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
+		      one->ip_str, one->pnn, err);
+	} else {
+		err = ctdb_reply_control_takeover_ip(reply);
+		if (err != 0) {
+			D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
+			      one->ip_str, one->pnn, err);
+		} else {
+			D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
+			       one->ip_str, one->pnn);
+			failed = false;
+		}
+	}
+
+	if (failed) {
+		state->ban_credits[one->pnn] += 1;
+		state->num_fails += 1;
+		state->err_any = err;
+	}
+
+	/* The per-control state is no longer needed */
+	talloc_free(one);
+
+	state->num_replies += 1;
+	if (state->num_replies < state->num_sent) {
+		/* Still waiting for other controls to complete */
+		return;
+	}
+
+	if (state->num_fails > 0) {
+		tevent_req_error(req, state->err_any);
+	} else {
+		tevent_req_done(req);
+	}
+}
+
+/*
+ * Collect the result of a take_ip request. Both the return value and
+ * *perr are exactly what generic_recv() produces for req.
+ */
+static bool take_ip_recv(struct tevent_req *req, int *perr)
+{
+	bool ok;
+
+	ok = generic_recv(req, perr);
+	return ok;
+}
+
+/**********************************************************************/
+
+/* State of one ipreallocated_send() request */
+struct ipreallocated_state {
+ uint32_t *pnns; /* nodes the IPREALLOCATED control is sent to */
+ int count; /* number of entries in pnns */
+ uint32_t *ban_credits; /* caller-supplied counters, indexed by PNN */
+};
+
+static void ipreallocated_done(struct tevent_req *subreq);
+
+/*
+ * Send an IPREALLOCATED control to the count nodes listed in pnns.
+ *
+ * Returns the new request, or NULL if it could not be created. The
+ * result is evaluated in ipreallocated_done(), which also charges
+ * ban_credits for nodes that report an error.
+ */
+static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
+					     struct tevent_context *ev,
+					     struct ctdb_client_context *client,
+					     uint32_t *pnns,
+					     int count,
+					     struct timeval timeout,
+					     uint32_t *ban_credits)
+{
+	struct ipreallocated_state *state;
+	struct ctdb_req_control ctl;
+	struct tevent_req *req;
+	struct tevent_req *multi;
+
+	req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	/* Remember what ipreallocated_done() needs to map errors to nodes */
+	state->pnns = pnns;
+	state->count = count;
+	state->ban_credits = ban_credits;
+
+	ctdb_req_control_ipreallocated(&ctl);
+	multi = ctdb_client_control_multi_send(state, ev, client,
+					       pnns, count,
+					       timeout, /* cumulative */
+					       &ctl);
+	if (tevent_req_nomem(multi, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	tevent_req_set_callback(multi, ipreallocated_done, req);
+	return req;
+}
+
+/*
+ * Completion handler for the IPREALLOCATED multi-control.
+ *
+ * On success the enclosing request is simply marked done. On failure
+ * every node with a non-zero entry in the returned error list is
+ * logged and charged one ban credit; if no per-node error can be
+ * identified a generic internal error is logged instead. The request
+ * then fails with the error code reported by the multi-control.
+ */
+static void ipreallocated_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct ipreallocated_state *state = tevent_req_data(
+		req, struct ipreallocated_state);
+	int *err_list = NULL;
+	/* Initialised like in take_ip_done() so it is never read unset */
+	int ret = 0;
+	int i;
+	bool status, found_errors;
+
+	status = ctdb_client_control_multi_recv(subreq, &ret, state,
+						&err_list, NULL);
+	TALLOC_FREE(subreq);
+
+	if (status) {
+		D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
+		tevent_req_done(req);
+		return;
+	}
+
+	/* Get some clear error messages out of err_list and count
+	 * banning credits. err_list stays NULL if the receive function
+	 * had no per-node results to hand back, so it must not be
+	 * indexed blindly.
+	 */
+	found_errors = false;
+	if (err_list != NULL) {
+		for (i = 0; i < state->count; i++) {
+			int err = err_list[i];
+			if (err != 0) {
+				uint32_t pnn = state->pnns[i];
+
+				D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
+				      pnn, err);
+
+				state->ban_credits[pnn]++;
+				found_errors = true;
+			}
+		}
+	}
+
+	if (! found_errors) {
+		D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
+	}
+
+	tevent_req_error(req, ret);
+}
+
+/*
+ * Collect the result of an ipreallocated request. Both the return
+ * value and *perr are exactly what generic_recv() produces for req.
+ */
+static bool ipreallocated_recv(struct tevent_req *req, int *perr)
+{
+	bool ok;
+
+	ok = generic_recv(req, perr);
+	return ok;
+}
+
+/**********************************************************************/
+
+/*
+ * Recalculate the allocation of public IPs to nodes and have the
+ * nodes host their allocated addresses.
+ *
+ * - Get tunables
+ * - Get nodemap
+ * - Initialise IP allocation state. Pass:
+ * + algorithm to be used;
+ * + various tunables (NoIPTakeover, NoIPFailback)
+ * + list of nodes to force rebalance (internal structure, currently
+ * no way to fetch, only used by LCP2 for nodes that have had new
+ * IP addresses added).
+ * - Set IP flags for IP allocation based on node map
+ * - Retrieve known and available IP addresses (done separately so
+ * values can be faked in unit testing)
+ * - Use ipalloc_set_public_ips() to set known and available IP
+ * addresses for allocation
+ * - If cluster can't host IP addresses then jump to IPREALLOCATED
+ * - Run IP allocation algorithm
+ * - Send RELEASE_IP to all nodes for IPs they should not host
+ * - Send TAKEOVER_IP to all nodes for IPs they should host
+ * - Send IPREALLOCATED to all nodes
+ */
+
+/* State carried through all stages of one takeover_send() request */
+struct takeover_state {
+ struct tevent_context *ev; /* event context used for every sub-request */
+ struct ctdb_client_context *client; /* connection to the local daemon */
+ struct timeval timeout; /* cumulative deadline for the RELEASE_IP,
+ * TAKEOVER_IP and IPREALLOCATED stages */
+ unsigned int num_nodes; /* number of nodes in the fetched node map */
+ uint32_t *pnns_connected; /* from list_of_connected_nodes() */
+ int num_connected; /* entries in pnns_connected */
+ uint32_t *pnns_active; /* from list_of_active_nodes() */
+ int num_active; /* entries in pnns_active */
+ uint32_t destnode; /* ctdb_client_pnn(client); target of the
+ * GET_ALL_TUNABLES and GET_NODEMAP controls */
+ uint32_t *force_rebalance_nodes; /* passed on to ipalloc_state_init() */
+ struct ctdb_tunable_list *tun_list; /* reply of GET_ALL_TUNABLES */
+ struct ipalloc_state *ipalloc_state; /* IP allocation state */
+ struct ctdb_public_ip_list *known_ips; /* known IPs, indexed by PNN */
+ struct public_ip_list *all_ips; /* result of ipalloc() */
+ uint32_t *ban_credits; /* num_nodes counters, indexed by PNN */
+};
+
+static void takeover_tunables_done(struct tevent_req *subreq);
+static void takeover_nodemap_done(struct tevent_req *subreq);
+static void takeover_known_ips_done(struct tevent_req *subreq);
+static void takeover_avail_ips_done(struct tevent_req *subreq);
+static void takeover_release_ip_done(struct tevent_req *subreq);
+static void takeover_take_ip_done(struct tevent_req *subreq);
+static void takeover_ipreallocated(struct tevent_req *req);
+static void takeover_ipreallocated_done(struct tevent_req *subreq);
+static void takeover_failed(struct tevent_req *subreq, int ret);
+static void takeover_failed_done(struct tevent_req *subreq);
+
+/*
+ * Start a takeover run.
+ *
+ * Creates the request, records the caller's arguments and kicks off
+ * the first stage by asking the local node for all tunables; the rest
+ * of the sequence is driven from takeover_tunables_done().
+ *
+ * Returns the new request, or NULL if it could not be created.
+ */
+static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
+					struct tevent_context *ev,
+					struct ctdb_client_context *client,
+					uint32_t *force_rebalance_nodes)
+{
+	struct takeover_state *state;
+	struct ctdb_req_control ctl;
+	struct tevent_req *req;
+	struct tevent_req *tun_req;
+
+	req = tevent_req_create(mem_ctx, &state, struct takeover_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	state->ev = ev;
+	state->client = client;
+	state->force_rebalance_nodes = force_rebalance_nodes;
+	/* All setup controls are addressed to the node we are connected to */
+	state->destnode = ctdb_client_pnn(client);
+
+	ctdb_req_control_get_all_tunables(&ctl);
+	tun_req = ctdb_client_control_send(state, state->ev, state->client,
+					   state->destnode, TIMEOUT(),
+					   &ctl);
+	if (tevent_req_nomem(tun_req, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	tevent_req_set_callback(tun_req, takeover_tunables_done, req);
+	return req;
+}
+
+/*
+ * Completion handler for GET_ALL_TUNABLES.
+ *
+ * Stores the tunables in the request state, copies the takeover
+ * timeout tunable into takeover_timeout (declared outside this
+ * function and read again by later stages) and then requests the node
+ * map from the same node. Any failure ends the request with the
+ * corresponding error.
+ */
+static void takeover_tunables_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct takeover_state *state = tevent_req_data(
+ req, struct takeover_state);
+ struct ctdb_reply_control *reply;
+ struct ctdb_req_control request;
+ int ret;
+ bool status;
+
+ status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ /* Unpack the reply; the tunable list is allocated on state */
+ ret = ctdb_reply_control_get_all_tunables(reply, state,
+ &state->tun_list);
+ if (ret != 0) {
+ D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ talloc_free(reply);
+
+ takeover_timeout = state->tun_list->takeover_timeout;
+
+ /* Next stage: fetch the node map */
+ ctdb_req_control_get_nodemap(&request);
+ subreq = ctdb_client_control_send(state, state->ev, state->client,
+ state->destnode, TIMEOUT(),
+ &request);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, takeover_nodemap_done, req);
+}
+
+/*
+ * Completion handler for GET_NODEMAP.
+ *
+ * Derives the lists of connected and active nodes from the node map,
+ * sets a default timeout, allocates the per-node ban credit counters
+ * and then either
+ *  - jumps straight to IPREALLOCATED when the environment variable
+ *    CTDB_DISABLE_IP_FAILOVER is set, or
+ *  - initialises the IP allocation state and starts fetching the
+ *    known public IPs from all connected nodes.
+ *
+ * NOTE(review): unlike takeover_tunables_done(), neither reply nor
+ * nodemap is freed here; both appear to stay attached to state until
+ * the request is freed - confirm this is intended.
+ */
+static void takeover_nodemap_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct takeover_state *state = tevent_req_data(
+ req, struct takeover_state);
+ struct ctdb_reply_control *reply;
+ bool status;
+ int ret;
+ struct ctdb_node_map *nodemap;
+ const char *ptr;
+
+ status = ctdb_client_control_recv(subreq, &ret, state, &reply);
+ TALLOC_FREE(subreq);
+ if (! status) {
+ D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
+ state->destnode, ret);
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
+ if (ret != 0) {
+ D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
+ tevent_req_error(req, ret);
+ return;
+ }
+
+ state->num_nodes = nodemap->num;
+
+ /* A count of zero or less is reported to the caller as ENOMEM */
+ state->num_connected = list_of_connected_nodes(nodemap,
+ CTDB_UNKNOWN_PNN, state,
+ &state->pnns_connected);
+ if (state->num_connected <= 0) {
+ tevent_req_error(req, ENOMEM);
+ return;
+ }
+
+ state->num_active = list_of_active_nodes(nodemap,
+ CTDB_UNKNOWN_PNN, state,
+ &state->pnns_active);
+ if (state->num_active <= 0) {
+ tevent_req_error(req, ENOMEM);
+ return;
+ }
+
+ /* Default timeout for early jump to IPREALLOCATED. See below
+ * for explanation of 3 times...
+ */
+ state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
+
+ /* One zero-initialised counter per node */
+ state->ban_credits = talloc_zero_array(state, uint32_t,
+ state->num_nodes);
+ if (tevent_req_nomem(state->ban_credits, req)) {
+ return;
+ }
+
+ /* Only the presence of the variable matters, not its value */
+ ptr = getenv("CTDB_DISABLE_IP_FAILOVER");
+ if (ptr != NULL) {
+ /* IP failover is completely disabled so just send out
+ * ipreallocated event.
+ */
+ takeover_ipreallocated(req);
+ return;
+ }
+
+ state->ipalloc_state =
+ ipalloc_state_init(
+ state, state->num_nodes,
+ determine_algorithm(state->tun_list),
+ (state->tun_list->no_ip_takeover != 0),
+ (state->tun_list->no_ip_failback != 0),
+ state->force_rebalance_nodes);
+ if (tevent_req_nomem(state->ipalloc_state, req)) {
+ return;
+ }
+
+ /* First fetch: known IPs (last argument false) from connected nodes */
+ subreq = get_public_ips_send(state, state->ev, state->client,
+ state->pnns_connected, state->num_connected,
+ state->num_nodes, state->ban_credits,
+ false);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+
+ tevent_req_set_callback(subreq, takeover_known_ips_done, req);
+}
+
+/*
+ * Completion handler for the "known public IPs" fetch.
+ *
+ * Stores the known IPs in the request state, then asks for the
+ * available IPs, but only from those active nodes for which at least
+ * one known IP was reported. A failed fetch is routed through
+ * takeover_failed().
+ */
+static void takeover_known_ips_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	struct tevent_req *avail_req;
+	uint32_t *with_ips = NULL;
+	int n_with_ips = 0;
+	int err;
+	int idx;
+	bool ok;
+
+	ok = get_public_ips_recv(subreq, &err, state, &state->known_ips);
+	TALLOC_FREE(subreq);
+	if (! ok) {
+		D_ERR("Failed to fetch known public IPs\n");
+		takeover_failed(req, err);
+		return;
+	}
+
+	/* Worst case every active node has known IPs */
+	with_ips = talloc_zero_array(state, uint32_t, state->num_active);
+	if (tevent_req_nomem(with_ips, req)) {
+		return;
+	}
+
+	/* Keep only the active nodes that actually have known IPs */
+	for (idx = 0; idx < state->num_active; idx++) {
+		uint32_t pnn = state->pnns_active[idx];
+
+		if (state->known_ips[pnn].num > 0) {
+			with_ips[n_with_ips++] = pnn;
+		}
+	}
+
+	avail_req = get_public_ips_send(state, state->ev, state->client,
+					with_ips, n_with_ips,
+					state->num_nodes, state->ban_credits,
+					true);
+	if (tevent_req_nomem(avail_req, req)) {
+		return;
+	}
+
+	tevent_req_set_callback(avail_req, takeover_avail_ips_done, req);
+}
+
+/*
+ * Completion handler for the "available public IPs" fetch.
+ *
+ * Hands the known and available IPs to the allocation state. If no
+ * node can host IPs the run skips to IPREALLOCATED; otherwise the
+ * allocation is computed, the cumulative timeout is restarted and the
+ * RELEASE_IP stage is started. A failed fetch is routed through
+ * takeover_failed().
+ */
+static void takeover_avail_ips_done(struct tevent_req *subreq)
+{
+ struct tevent_req *req = tevent_req_callback_data(
+ subreq, struct tevent_req);
+ struct takeover_state *state = tevent_req_data(
+ req, struct takeover_state);
+ bool status;
+ int ret;
+ struct ctdb_public_ip_list *available_ips;
+
+ status = get_public_ips_recv(subreq, &ret, state, &available_ips);
+ TALLOC_FREE(subreq);
+
+ if (! status) {
+ D_ERR("Failed to fetch available public IPs\n");
+ takeover_failed(req, ret);
+ return;
+ }
+
+ ipalloc_set_public_ips(state->ipalloc_state,
+ state->known_ips, available_ips);
+
+ if (! ipalloc_can_host_ips(state->ipalloc_state)) {
+ D_NOTICE("No nodes available to host public IPs yet\n");
+ takeover_ipreallocated(req);
+ return;
+ }
+
+ /* Do the IP reassignment calculations */
+ state->all_ips = ipalloc(state->ipalloc_state);
+ if (tevent_req_nomem(state->all_ips, req)) {
+ return;
+ }
+
+ /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
+ * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
+ * seconds. However, RELEASE_IP can take longer due to TCP
+ * connection killing, so sometimes needs more time.
+ * Therefore, use a cumulative timeout of TakeoverTimeout * 3
+ * seconds across all 3 stages. No explicit expiry checks are
+ * needed before each stage because tevent is smart enough to
+ * fire the timeouts even if they are in the past. Initialise
+ * this here so it explicitly covers the stages we're
+ * interested in but, in particular, not the time taken by the
+ * ipalloc().
+ */
+ state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
+
+ /* RELEASE_IP goes to all connected nodes */
+ subreq = release_ip_send(state, state->ev, state->client,
+ state->pnns_connected, state->num_connected,
+ state->timeout, state->all_ips,
+ state->ban_credits);
+ if (tevent_req_nomem(subreq, req)) {
+ return;
+ }
+ tevent_req_set_callback(subreq, takeover_release_ip_done, req);
+}
+
+/*
+ * Completion handler for the RELEASE_IP stage. On success the
+ * TAKEOVER_IP stage is started with the same cumulative timeout; on
+ * failure the run is handed to takeover_failed().
+ */
+static void takeover_release_ip_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	struct tevent_req *take_req;
+	int err;
+	bool released;
+
+	released = release_ip_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+	if (! released) {
+		takeover_failed(req, err);
+		return;
+	}
+
+	/* Everything is released, so the addresses can now be taken over */
+	take_req = take_ip_send(state, state->ev, state->client,
+				state->timeout, state->all_ips,
+				state->ban_credits);
+	if (tevent_req_nomem(take_req, req)) {
+		return;
+	}
+
+	tevent_req_set_callback(take_req, takeover_take_ip_done, req);
+}
+
+/*
+ * Completion handler for the TAKEOVER_IP stage: continue with
+ * IPREALLOCATED on success, hand over to takeover_failed() otherwise.
+ */
+static void takeover_take_ip_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int err = 0;
+	bool taken;
+
+	taken = take_ip_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+
+	if (taken) {
+		takeover_ipreallocated(req);
+	} else {
+		takeover_failed(req, err);
+	}
+}
+
+/*
+ * Start the final stage: send IPREALLOCATED to all connected nodes
+ * using the timeout currently stored in the request state.
+ */
+static void takeover_ipreallocated(struct tevent_req *req)
+{
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	struct tevent_req *realloc_req;
+
+	realloc_req = ipreallocated_send(state, state->ev, state->client,
+					 state->pnns_connected,
+					 state->num_connected,
+					 state->timeout,
+					 state->ban_credits);
+	if (tevent_req_nomem(realloc_req, req)) {
+		return;
+	}
+
+	tevent_req_set_callback(realloc_req, takeover_ipreallocated_done, req);
+}
+
+/*
+ * Completion handler for the IPREALLOCATED stage: this is the last
+ * step, so success completes the whole takeover request and failure
+ * is handed to takeover_failed().
+ */
+static void takeover_ipreallocated_done(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	int err;
+	bool ok;
+
+	ok = ipreallocated_recv(subreq, &err);
+	TALLOC_FREE(subreq);
+
+	if (ok) {
+		tevent_req_done(req);
+	} else {
+		takeover_failed(req, err);
+	}
+}
+
+/* Callback data for the banning message sent by takeover_failed() */
+struct takeover_failed_state {
+ struct tevent_req *req; /* takeover request to fail afterwards */
+ int ret; /* error code the takeover request fails with */
+};
+
+/*
+ * Fail the takeover request with error ret, first assigning banning
+ * credits to the worst offender if bans are enabled.
+ *
+ * The node with the highest ban credit count (the lowest PNN wins a
+ * tie) is reported via a CTDB_SRVID_BANNING message sent to the node
+ * this client is connected to. The request is failed with ret once
+ * that message has been sent (see takeover_failed_done()), or
+ * immediately when bans are disabled, no node has any credits, or the
+ * message cannot be sent.
+ *
+ * The definition is marked static to match its forward declaration.
+ */
+static void takeover_failed(struct tevent_req *req, int ret)
+{
+	struct takeover_state *state = tevent_req_data(
+		req, struct takeover_state);
+	struct tevent_req *subreq;
+	struct ctdb_req_message message;
+	struct takeover_failed_state *substate;
+	uint32_t max_pnn = CTDB_UNKNOWN_PNN;
+	unsigned int max_credits = 0;
+	uint32_t pnn;
+
+	/* Check that bans are enabled */
+	if (state->tun_list->enable_bans == 0) {
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	/* Find the node that collected the most credits */
+	for (pnn = 0; pnn < state->num_nodes; pnn++) {
+		if (state->ban_credits[pnn] > max_credits) {
+			max_pnn = pnn;
+			max_credits = state->ban_credits[pnn];
+		}
+	}
+
+	if (max_credits == 0) {
+		/* Nobody to blame */
+		tevent_req_error(req, ret);
+		return;
+	}
+
+	D_WARNING("Assigning banning credits to node %u\n", max_pnn);
+
+	substate = talloc_zero(state, struct takeover_failed_state);
+	if (tevent_req_nomem(substate, req)) {
+		return;
+	}
+	substate->req = req;
+	substate->ret = ret;
+
+	message.srvid = CTDB_SRVID_BANNING;
+	message.data.pnn = max_pnn;
+
+	subreq = ctdb_client_message_send(
+		state, state->ev, state->client,
+		ctdb_client_pnn(state->client),
+		&message);
+	if (subreq == NULL) {
+		D_ERR("failed to assign banning credits\n");
+		/* No callback will run, so release the callback data here */
+		talloc_free(substate);
+		tevent_req_error(req, ret);
+		return;
+	}
+	tevent_req_set_callback(subreq, takeover_failed_done, substate);
+}
+
+/*
+ * Completion handler for the banning message sent by
+ * takeover_failed(). Whether or not the message could be delivered,
+ * the takeover request is failed with the error that was recorded
+ * when the failure was first detected.
+ */
+static void takeover_failed_done(struct tevent_req *subreq)
+{
+	struct takeover_failed_state *failed = tevent_req_callback_data(
+		subreq, struct takeover_failed_state);
+	struct tevent_req *req = failed->req;
+	int orig_err = failed->ret;
+	int msg_err;
+	bool sent;
+
+	sent = ctdb_client_message_recv(subreq, &msg_err);
+	TALLOC_FREE(subreq);
+	if (! sent) {
+		D_ERR("failed to assign banning credits, ret=%d\n", msg_err);
+	}
+
+	talloc_free(failed);
+	tevent_req_error(req, orig_err);
+}
+
+/*
+ * Collect the result of a takeover request. Only *perr, as filled in
+ * by generic_recv(), is reported; the boolean result is deliberately
+ * dropped.
+ */
+static void takeover_recv(struct tevent_req *req, int *perr)
+{
+	(void)generic_recv(req, perr);
+}
+
+/*
+ * Parse a comma-separated list of node numbers.
+ *
+ * mem_ctx - talloc parent of the returned array
+ * s       - the list, e.g. "0,2,5"
+ *
+ * Returns a talloc array with one uint32_t per list element, or NULL
+ * on allocation failure or when an element is not a plain decimal
+ * number that fits into uint32_t. Malformed elements are rejected
+ * rather than silently converted, since the caller treats NULL as an
+ * invalid command-line argument.
+ */
+static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
+{
+	char *strv = NULL;
+	int num, i, ret;
+	char *t;
+	uint32_t *nodes;
+
+	ret = strv_split(mem_ctx, &strv, s, ",");
+	if (ret != 0) {
+		D_ERR("out of memory\n");
+		return NULL;
+	}
+
+	num = strv_count(strv);
+
+	nodes = talloc_array(mem_ctx, uint32_t, num);
+	if (nodes == NULL) {
+		D_ERR("out of memory\n");
+		TALLOC_FREE(strv);
+		return NULL;
+	}
+
+	t = NULL;
+	for (i = 0; i < num; i++) {
+		char *end = NULL;
+		unsigned long val;
+
+		t = strv_next(strv, t);
+
+		/* strtoul() would accept leading whitespace and a sign,
+		 * so insist on a digit in the first position
+		 */
+		if (t == NULL || t[0] < '0' || t[0] > '9') {
+			goto invalid;
+		}
+
+		errno = 0;
+		val = strtoul(t, &end, 10);
+		if (errno != 0 || end == t || *end != '\0' ||
+		    val > UINT32_MAX) {
+			goto invalid;
+		}
+
+		nodes[i] = (uint32_t)val;
+	}
+
+	/* The split strings are only needed while parsing */
+	TALLOC_FREE(strv);
+	return nodes;
+
+invalid:
+	D_ERR("Invalid node number \"%s\"\n", t != NULL ? t : "");
+	TALLOC_FREE(nodes);
+	TALLOC_FREE(strv);
+	return NULL;
+}
+
+/* Print a one-line synopsis of the command-line arguments to stderr */
+static void usage(const char *progname)
+{
+	FILE *out = stderr;
+
+	fprintf(out,
+		"\nUsage: %s <output-fd> <ctdb-socket-path> [<force-rebalance-nodes>]\n",
+		progname);
+}
+
+/*
+ * Entry point of the takeover helper.
+ *
+ * Arguments - write fd, socket path and, optionally, a comma-separated
+ * list of nodes to force rebalance.
+ *
+ * The takeover request is run to completion and the resulting int is
+ * written to the write fd and also used as the return value.
+ */
+int main(int argc, const char *argv[])
+{
+ int write_fd;
+ const char *sockpath;
+ TALLOC_CTX *mem_ctx;
+ struct tevent_context *ev;
+ struct ctdb_client_context *client;
+ bool status;
+ int ret;
+ struct tevent_req *req;
+ uint32_t *force_rebalance_nodes = NULL;
+
+ if (argc < 3 || argc > 4) {
+ usage(argv[0]);
+ exit(1);
+ }
+
+ write_fd = atoi(argv[1]);
+ sockpath = argv[2];
+
+ /* Everything allocated below hangs off mem_ctx and is freed at done */
+ mem_ctx = talloc_new(NULL);
+ if (mem_ctx == NULL) {
+ fprintf(stderr, "talloc_new() failed\n");
+ ret = ENOMEM;
+ goto done;
+ }
+
+ if (argc == 4) {
+ force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
+ if (force_rebalance_nodes == NULL) {
+ usage(argv[0]);
+ ret = EINVAL;
+ goto done;
+ }
+ }
+
+ ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
+ if (ret != 0) {
+ fprintf(stderr,
+ "ctdb-takeover: Unable to initialize logging\n");
+ goto done;
+ }
+
+ ev = tevent_context_init(mem_ctx);
+ if (ev == NULL) {
+ D_ERR("tevent_context_init() failed\n");
+ ret = ENOMEM;
+ goto done;
+ }
+
+ status = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL);
+ if (!status) {
+ D_ERR("logging_setup_sighup_handler() failed\n");
+ ret = ENOMEM;
+ goto done;
+ }
+
+ ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
+ if (ret != 0) {
+ D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
+ goto done;
+ }
+
+ req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
+ if (req == NULL) {
+ D_ERR("takeover_send() failed\n");
+ ret = 1;
+ goto done;
+ }
+
+ /* Run the event loop until the takeover request has finished */
+ if (! tevent_req_poll(req, ev)) {
+ D_ERR("tevent_req_poll() failed\n");
+ ret = 1;
+ goto done;
+ }
+
+ /* NOTE(review): ret is 0 here (left over from ctdb_client_init());
+ * the check below presumably relies on takeover_recv() overwriting
+ * it only on failure - confirm against generic_recv().
+ */
+ takeover_recv(req, &ret);
+ TALLOC_FREE(req);
+ if (ret != 0) {
+ D_ERR("takeover run failed, ret=%d\n", ret);
+ }
+
+done:
+ /* Report the result to whoever holds the other end of write_fd */
+ sys_write_v(write_fd, &ret, sizeof(ret));
+
+ talloc_free(mem_ctx);
+ return ret;
+}
diff --git a/ctdb/server/ctdb_traverse.c b/ctdb/server/ctdb_traverse.c
new file mode 100644
index 0000000..4865dcc
--- /dev/null
+++ b/ctdb/server/ctdb_traverse.c
@@ -0,0 +1,781 @@
+/*
+ efficient async ctdb traverse
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ * Traverse callback. Within this file it is invoked with tdb_null for
+ * both key and data to signal that a traverse has ended or timed out.
+ */
+typedef void (*ctdb_traverse_fn_t)(void *private_data, TDB_DATA key, TDB_DATA data);
+
+/*
+ handle returned to caller - freeing this handle will kill the child and
+ terminate the traverse
+ */
+struct ctdb_traverse_local_handle {
+ struct ctdb_traverse_local_handle *next, *prev; /* links in ctdb_db->traverse */
+ struct ctdb_db_context *ctdb_db; /* database being traversed */
+ int fd[2]; /* pipe: the child writes its result to fd[1], the parent reads fd[0] */
+ pid_t child; /* pid of the traverse child, as returned by ctdb_fork() */
+ uint64_t srvid; /* copied from the traverse_all_state */
+ uint32_t client_reqid; /* copied from the traverse_all_state */
+ uint32_t reqid; /* request id put into every marshalled record */
+ int srcnode; /* node the TRAVERSE_DATA controls are sent to */
+ void *private_data; /* passed as first argument to callback */
+ ctdb_traverse_fn_t callback; /* called when the child reports completion */
+ bool withemptyrecords; /* if false, header-only records of volatile databases are skipped */
+ struct tevent_fd *fde; /* read event on fd[0] */
+ int records_failed; /* records that could not be marshalled or sent */
+ int records_sent; /* records sent successfully */
+};
+
+/*
+ * Called when the traverse child's pipe becomes readable, i.e. when
+ * the traverse has been completed by the child or the child has gone
+ * away. The outcome is only logged; the completion callback is then
+ * invoked with tdb_null key and data in every case.
+ */
+static void ctdb_traverse_child_handler(struct tevent_context *ev, struct tevent_fd *fde,
+					uint16_t flags, void *private_data)
+{
+	struct ctdb_traverse_local_handle *h = talloc_get_type(private_data,
+							struct ctdb_traverse_local_handle);
+	/* Fetched up front, before anything else is done with h */
+	ctdb_traverse_fn_t done_fn = h->callback;
+	void *done_arg = h->private_data;
+	int count;
+	ssize_t nread;
+
+	/* Read the number of records sent by traverse child */
+	nread = sys_read(h->fd[0], &count, sizeof(count));
+
+	if (nread != (ssize_t)sizeof(count)) {
+		/* Traverse child failed */
+		DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d\n",
+				  h->ctdb_db->db_name, h->reqid));
+	} else if (count >= 0) {
+		DEBUG(DEBUG_INFO, ("Local traverse end db:%s reqid:%d records:%d\n",
+				   h->ctdb_db->db_name, h->reqid, count));
+	} else {
+		/* Traverse failed; a negative value is the negated record count */
+		count = -count;
+		DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d records:%d\n",
+				  h->ctdb_db->db_name, h->reqid, count));
+	}
+
+	done_fn(done_arg, tdb_null, tdb_null);
+}
+
+/*
+  destroy an in-flight traverse operation: unlink the handle from the
+  database's traverse list and kill the traverse child
+ */
+static int traverse_local_destructor(struct ctdb_traverse_local_handle *h)
+{
+	struct ctdb_db_context *db = h->ctdb_db;
+
+	DLIST_REMOVE(db->traverse, h);
+	ctdb_kill(db->ctdb, h->child, SIGKILL);
+
+	return 0;
+}
+
+/*
+ callback from tdb_traverse_read()
+
+ Called in the traverse child for every record. For volatile databases
+ header-only records are skipped (unless withemptyrecords is set) and
+ so are records whose dmaster is not this node. Every remaining record
+ is marshalled and sent to h->srcnode as a CTDB_CONTROL_TRAVERSE_DATA
+ control.
+
+ Returns 0 when the record was sent or skipped, -1 when marshalling or
+ sending failed (records_failed is incremented in that case).
+ */
+static int ctdb_traverse_local_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
+{
+ struct ctdb_traverse_local_handle *h = talloc_get_type(p,
+ struct ctdb_traverse_local_handle);
+ struct ctdb_rec_data_old *d;
+ struct ctdb_ltdb_header *hdr;
+ int res, status;
+ TDB_DATA outdata;
+
+ /* NOTE(review): hdr is read below without checking that data.dsize
+ * covers a full header when withemptyrecords is set - confirm that
+ * records shorter than the header cannot occur.
+ */
+ hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+ if (ctdb_db_volatile(h->ctdb_db)) {
+ /* filter out zero-length records */
+ if (!h->withemptyrecords &&
+ data.dsize <= sizeof(struct ctdb_ltdb_header))
+ {
+ return 0;
+ }
+
+ /* filter out non-authoritative records */
+ if (hdr->dmaster != h->ctdb_db->ctdb->pnn) {
+ return 0;
+ }
+ }
+
+ /* NOTE(review): d is not freed in this function; if
+ * ctdb_marshall_record() allocates it as a talloc child of h it
+ * accumulates for every record until the child exits - confirm.
+ */
+ d = ctdb_marshall_record(h, h->reqid, key, NULL, data);
+ if (d == NULL) {
+ /* error handling is tricky in this child code .... */
+ h->records_failed++;
+ return -1;
+ }
+
+ outdata.dptr = (uint8_t *)d;
+ outdata.dsize = d->length;
+
+ res = ctdb_control(h->ctdb_db->ctdb, h->srcnode, 0, CTDB_CONTROL_TRAVERSE_DATA,
+ CTDB_CTRL_FLAG_NOREPLY, outdata, NULL, NULL, &status, NULL, NULL);
+ if (res != 0 || status != 0) {
+ h->records_failed++;
+ return -1;
+ }
+
+ h->records_sent++;
+ return 0;
+}
+
+/*
+ * State of one local traverse started on behalf of a TRAVERSE_ALL or
+ * TRAVERSE_ALL_EXT control; freed by traverse_all_callback().
+ */
+struct traverse_all_state {
+ struct ctdb_context *ctdb; /* daemon context */
+ struct ctdb_traverse_local_handle *h; /* handle of the local traverse */
+ uint32_t reqid; /* reqid taken from the control */
+ uint32_t srcnode; /* pnn taken from the control; records are sent there */
+ uint32_t client_reqid; /* client_reqid taken from the control */
+ uint64_t srvid; /* srvid taken from the control */
+ bool withemptyrecords; /* whether header-only records are sent as well */
+};
+
+/*
+  setup a non-blocking traverse of a local ltdb. The traverse runs in a
+  forked child which sends every selected record to the requesting
+  node; when the child reports completion through a pipe, the callback
+  is called once with tdb_null for key and data. To stop the traverse,
+  talloc_free() the traverse_handle.
+
+  Returns the handle (a talloc child of all_state), or NULL if the
+  handle, the pipe, the child or the fd event could not be created.
+
+  The daemon's pid is recorded before forking: the child waits for that
+  process to exit before terminating, and calling getpid() after the
+  fork would yield the child's own pid instead.
+ */
+static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_context *ctdb_db,
+							      ctdb_traverse_fn_t callback,
+							      struct traverse_all_state *all_state)
+{
+	struct ctdb_traverse_local_handle *h;
+	pid_t parent;
+	int ret;
+
+	h = talloc_zero(all_state, struct ctdb_traverse_local_handle);
+	if (h == NULL) {
+		return NULL;
+	}
+
+	ret = pipe(h->fd);
+
+	if (ret != 0) {
+		talloc_free(h);
+		return NULL;
+	}
+
+	/* Must be taken before the fork so that it names the daemon */
+	parent = getpid();
+
+	h->child = ctdb_fork(ctdb_db->ctdb);
+
+	if (h->child == (pid_t)-1) {
+		close(h->fd[0]);
+		close(h->fd[1]);
+		talloc_free(h);
+		return NULL;
+	}
+
+	h->callback = callback;
+	h->private_data = all_state;
+	h->ctdb_db = ctdb_db;
+	h->client_reqid = all_state->client_reqid;
+	h->reqid = all_state->reqid;
+	h->srvid = all_state->srvid;
+	h->srcnode = all_state->srcnode;
+	h->withemptyrecords = all_state->withemptyrecords;
+
+	if (h->child == 0) {
+		/* start the traverse in the child */
+		int res, status;
+		struct ctdb_context *ctdb = ctdb_db->ctdb;
+		struct ctdb_rec_data_old *d;
+		TDB_DATA outdata;
+
+		/* The child only writes to the pipe */
+		close(h->fd[0]);
+
+		prctl_set_comment("ctdb_traverse");
+		if (switch_from_server_to_client(ctdb) != 0) {
+			DEBUG(DEBUG_CRIT, ("Failed to switch traverse child into client mode\n"));
+			_exit(0);
+		}
+
+		/* Prepare the empty end-of-traverse record up front */
+		d = ctdb_marshall_record(h, h->reqid, tdb_null, NULL, tdb_null);
+		if (d == NULL) {
+			res = 0;
+			sys_write(h->fd[1], &res, sizeof(int));
+			_exit(0);
+		}
+
+		res = tdb_traverse_read(ctdb_db->ltdb->tdb, ctdb_traverse_local_fn, h);
+		if (res == -1 || h->records_failed > 0) {
+			/* traverse failed: report the negated record count */
+			res = -(h->records_sent);
+		} else {
+			res = h->records_sent;
+		}
+
+		/* Wait till all the data is flushed from output queue */
+		while (ctdb_queue_length(ctdb->daemon.queue) > 0) {
+			tevent_loop_once(ctdb->ev);
+		}
+
+		/* End traverse by sending empty record */
+		outdata.dptr = (uint8_t *)d;
+		outdata.dsize = d->length;
+		ret = ctdb_control(ctdb, h->srcnode, 0,
+				   CTDB_CONTROL_TRAVERSE_DATA,
+				   CTDB_CTRL_FLAG_NOREPLY, outdata,
+				   NULL, NULL, &status, NULL, NULL);
+		if (ret == -1 || status == -1) {
+			if (res > 0) {
+				res = -res;
+			}
+		}
+
+		/* Tell the parent how many records were sent */
+		sys_write(h->fd[1], &res, sizeof(res));
+
+		ctdb_wait_for_process_to_exit(parent);
+		_exit(0);
+	}
+
+	/* Parent: keep only the read end of the pipe */
+	close(h->fd[1]);
+	set_close_on_exec(h->fd[0]);
+
+	/* From here on freeing h unlinks it and kills the child */
+	talloc_set_destructor(h, traverse_local_destructor);
+
+	DLIST_ADD(ctdb_db->traverse, h);
+
+	h->fde = tevent_add_fd(ctdb_db->ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
+			       ctdb_traverse_child_handler, h);
+	if (h->fde == NULL) {
+		close(h->fd[0]);
+		talloc_free(h);
+		return NULL;
+	}
+	tevent_fd_set_auto_close(h->fde);
+
+	return h;
+}
+
+
+/* Handle for a traverse started by ctdb_daemon_traverse_all() */
+struct ctdb_traverse_all_handle {
+ struct ctdb_context *ctdb; /* daemon context */
+ struct ctdb_db_context *ctdb_db; /* database being traversed */
+ uint32_t reqid; /* id from reqid_new(); removed again by the destructor */
+ ctdb_traverse_fn_t callback; /* caller's traverse callback */
+ void *private_data; /* first argument passed to callback */
+ uint32_t null_count; /* starts at 0; updated outside the code shown here */
+ bool timedout; /* set by ctdb_traverse_all_timeout() */
+};
+
+/*
+  destroy a traverse_all op: give its request id back to the daemon's
+  id registry
+ */
+static int ctdb_traverse_all_destructor(struct ctdb_traverse_all_handle *state)
+{
+	struct ctdb_context *ctdb = state->ctdb;
+
+	reqid_remove(ctdb->idr, state->reqid);
+
+	return 0;
+}
+
+/*
+ * Timer handler, called when a traverse times out: logs the event,
+ * bumps the traverse timeout statistic, flags the handle as timed out
+ * and ends the traverse by invoking the callback with tdb_null key
+ * and data.
+ */
+static void ctdb_traverse_all_timeout(struct tevent_context *ev,
+				      struct tevent_timer *te,
+				      struct timeval t, void *private_data)
+{
+	struct ctdb_traverse_all_handle *handle;
+
+	handle = talloc_get_type(private_data, struct ctdb_traverse_all_handle);
+
+	DEBUG(DEBUG_ERR,(__location__ " Traverse all timeout on database:%s\n", handle->ctdb_db->db_name));
+	CTDB_INCREMENT_STAT(handle->ctdb, timeouts.traverse);
+
+	handle->timedout = true;
+	handle->callback(handle->private_data, tdb_null, tdb_null);
+}
+
+
+/*
+ * State passed to ctdb_daemon_traverse_all(); it is the talloc parent
+ * of the traverse handle and the private data given to the callback.
+ */
+struct traverse_start_state {
+ struct ctdb_context *ctdb;
+ struct ctdb_traverse_all_handle *h;
+ uint32_t srcnode;
+ uint32_t reqid; /* sent to the other nodes as client_reqid */
+ uint32_t db_id;
+ uint64_t srvid; /* sent to the other nodes unchanged */
+ bool withemptyrecords; /* selects TRAVERSE_ALL_EXT instead of TRAVERSE_ALL */
+ int num_records;
+};
+
+
+/*
+ setup a cluster-wide non-blocking traverse of a ctdb. The
+ callback function will be called on every record in the local
+ ltdb. To stop the traverse, talloc_free() the traverse_handle.
+
+ The traverse is finished when the callback is called with tdb_null
+ for key and data
+
+ Volatile databases are traversed on all active nodes; persistent
+ databases on a single node, preferably the local one. The request is
+ sent as TRAVERSE_ALL_EXT when start_state->withemptyrecords is set
+ and as TRAVERSE_ALL otherwise.
+
+ Returns the handle (a talloc child of start_state), or NULL if the
+ handle could not be allocated or the control could not be sent.
+ */
+static struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_context *ctdb_db,
+ ctdb_traverse_fn_t callback,
+ struct traverse_start_state *start_state)
+{
+ struct ctdb_traverse_all_handle *state;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ int ret;
+ TDB_DATA data;
+ struct ctdb_traverse_all r;
+ struct ctdb_traverse_all_ext r_ext;
+ uint32_t destination;
+
+ state = talloc(start_state, struct ctdb_traverse_all_handle);
+ if (state == NULL) {
+ return NULL;
+ }
+
+ state->ctdb = ctdb;
+ state->ctdb_db = ctdb_db;
+ state->reqid = reqid_new(ctdb_db->ctdb->idr, state);
+ state->callback = callback;
+ state->private_data = start_state;
+ state->null_count = 0;
+ state->timedout = false;
+
+ /* The destructor removes the reqid registered above */
+ talloc_set_destructor(state, ctdb_traverse_all_destructor);
+
+ /* NOTE(review): r and r_ext are filled field by field and then sent
+ * as raw bytes; any padding in them is left uninitialised.
+ */
+ if (start_state->withemptyrecords) {
+ r_ext.db_id = ctdb_db->db_id;
+ r_ext.reqid = state->reqid;
+ r_ext.pnn = ctdb->pnn;
+ r_ext.client_reqid = start_state->reqid;
+ r_ext.srvid = start_state->srvid;
+ r_ext.withemptyrecords = start_state->withemptyrecords;
+
+ data.dptr = (uint8_t *)&r_ext;
+ data.dsize = sizeof(r_ext);
+ } else {
+ r.db_id = ctdb_db->db_id;
+ r.reqid = state->reqid;
+ r.pnn = ctdb->pnn;
+ r.client_reqid = start_state->reqid;
+ r.srvid = start_state->srvid;
+
+ data.dptr = (uint8_t *)&r;
+ data.dsize = sizeof(r);
+ }
+
+ if (ctdb_db_volatile(ctdb_db)) {
+ /* volatile database, traverse all active nodes */
+ destination = CTDB_BROADCAST_ACTIVE;
+ } else {
+ unsigned int i;
+ /* persistent database, traverse one node, preferably
+ * the local one
+ */
+ destination = ctdb->pnn;
+ /* check we are in the vnnmap */
+ for (i=0; i < ctdb->vnn_map->size; i++) {
+ if (ctdb->vnn_map->map[i] == ctdb->pnn) {
+ break;
+ }
+ }
+ /* if we are not in the vnn map we just pick the first
+ * node instead
+ *
+ * NOTE(review): this reads map[0] even when the vnn map
+ * is empty (size == 0) - confirm that cannot happen.
+ */
+ if (i == ctdb->vnn_map->size) {
+ destination = ctdb->vnn_map->map[0];
+ }
+ }
+
+ /* tell all the nodes in the cluster to start sending records to this
+ * node, or if it is a persistent database, just tell the local
+ * node
+ */
+
+ if (start_state->withemptyrecords) {
+ ret = ctdb_daemon_send_control(ctdb, destination, 0,
+ CTDB_CONTROL_TRAVERSE_ALL_EXT,
+ 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+ } else {
+ ret = ctdb_daemon_send_control(ctdb, destination, 0,
+ CTDB_CONTROL_TRAVERSE_ALL,
+ 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+ }
+
+ if (ret != 0) {
+ talloc_free(state);
+ return NULL;
+ }
+
+ DEBUG(DEBUG_NOTICE,("Starting traverse on DB %s (id %d)\n",
+ ctdb_db->db_name, state->reqid));
+
+ /* timeout the traverse; the timer is a talloc child of state */
+ tevent_add_timer(ctdb->ev, state,
+ timeval_current_ofs(ctdb->tunable.traverse_timeout, 0),
+ ctdb_traverse_all_timeout, state);
+
+ return state;
+}
+
+/*
+  called when local traverse ends; all that is left to do is to free
+  the per-traverse state
+ */
+static void traverse_all_callback(void *p, TDB_DATA key, TDB_DATA data)
+{
+	struct traverse_all_state *all_state;
+
+	all_state = talloc_get_type(p, struct traverse_all_state);
+
+	/* we're done */
+	talloc_free(all_state);
+}
+
+/*
+ * extended version to take the "withemptyrecords" parameter
+ *
+ * Handles an incoming CTDB_CONTROL_TRAVERSE_ALL_EXT control: validates
+ * the payload size, looks up the database and starts a local traverse
+ * whose records are sent to the node named in the request.
+ *
+ * Returns 0 once the traverse has been started, -1 on a malformed
+ * request, an unknown database, an unhealthy database (unless the
+ * allow_unhealthy_db_read tunable is non-zero) or any failure to set
+ * up the traverse. outdata is not used.
+ */
+int32_t ctdb_control_traverse_all_ext(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
+{
+ struct ctdb_traverse_all_ext *c = (struct ctdb_traverse_all_ext *)data.dptr;
+ struct traverse_all_state *state;
+ struct ctdb_db_context *ctdb_db;
+
+ /* c is only dereferenced after this size check */
+ if (data.dsize != sizeof(struct ctdb_traverse_all_ext)) {
+ DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all_ext\n"));
+ return -1;
+ }
+
+ ctdb_db = find_ctdb_db(ctdb, c->db_id);
+ if (ctdb_db == NULL) {
+ return -1;
+ }
+
+ if (ctdb_db->unhealthy_reason) {
+ if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+ DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+ ctdb_db->db_name, ctdb_db->unhealthy_reason));
+ return -1;
+ }
+ DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+ ctdb_db->db_name, ctdb_db->unhealthy_reason));
+ }
+
+ /* Freed by traverse_all_callback() when the local traverse ends */
+ state = talloc(ctdb_db, struct traverse_all_state);
+ if (state == NULL) {
+ return -1;
+ }
+
+ state->reqid = c->reqid;
+ state->srcnode = c->pnn;
+ state->ctdb = ctdb;
+ state->client_reqid = c->client_reqid;
+ state->srvid = c->srvid;
+ state->withemptyrecords = c->withemptyrecords;
+
+ state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state);
+ if (state->h == NULL) {
+ talloc_free(state);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+  handler for an incoming CTDB_CONTROL_TRAVERSE_ALL control: set up a
+  traverse of our local ltdb, whose records are sent back to the
+  originator as CTDB_CONTROL_TRAVERSE_DATA records.
+
+  returns 0 when the local traverse was started, -1 otherwise
+ */
+int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
+{
+	struct ctdb_traverse_all *req = (struct ctdb_traverse_all *)data.dptr;
+	struct ctdb_db_context *ctdb_db;
+	struct traverse_all_state *state;
+
+	if (data.dsize != sizeof(*req)) {
+		DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all\n"));
+		return -1;
+	}
+
+	ctdb_db = find_ctdb_db(ctdb, req->db_id);
+	if (ctdb_db == NULL) {
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason != NULL) {
+		/* reading an unhealthy db is only allowed when the tunable says so */
+		if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+			DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+					 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+			return -1;
+		}
+		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
+	state = talloc(ctdb_db, struct traverse_all_state);
+	if (state == NULL) {
+		return -1;
+	}
+
+	state->ctdb = ctdb;
+	state->srcnode = req->pnn;
+	state->reqid = req->reqid;
+	state->client_reqid = req->client_reqid;
+	state->srvid = req->srvid;
+	/* the plain control never asks for empty records */
+	state->withemptyrecords = false;
+
+	state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state);
+	if (state->h != NULL) {
+		return 0;
+	}
+
+	talloc_free(state);
+	return -1;
+}
+
+
+/*
+  called when a CTDB_CONTROL_TRAVERSE_DATA control comes in. We then
+  call the traverse_all callback with the record.
+
+  data is expected to hold one marshalled struct ctdb_rec_data_old.
+  The record is rejected (-1) unless the fixed header is complete, the
+  embedded length matches the size received, and the key and data
+  lengths fit in the bytes that follow the header.  -1 is also returned
+  when no traverse with the record's reqid is known any more.
+
+  An empty key together with empty data marks the end of a node's
+  traverse.  For volatile databases the callback only sees that marker
+  once the number of markers received equals the number of active nodes.
+ */
+int32_t ctdb_control_traverse_data(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
+{
+	struct ctdb_rec_data_old *d = (struct ctdb_rec_data_old *)data.dptr;
+	const size_t hdrlen = offsetof(struct ctdb_rec_data_old, data);
+	struct ctdb_traverse_all_handle *state;
+	TDB_DATA key;
+	ctdb_traverse_fn_t callback;
+	void *private_data;
+	size_t payload;
+
+	/* the whole fixed header must be present before any field is read */
+	if (data.dsize < hdrlen || data.dsize != d->length) {
+		DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_data\n"));
+		return -1;
+	}
+
+	/* key and data must both lie inside what follows the header;
+	   written so that the additions cannot wrap */
+	payload = data.dsize - hdrlen;
+	if (d->keylen > payload || d->datalen > payload - d->keylen) {
+		DEBUG(DEBUG_ERR,("Bad key/data length in ctdb_control_traverse_data\n"));
+		return -1;
+	}
+
+	state = reqid_find(ctdb->idr, d->reqid, struct ctdb_traverse_all_handle);
+	if (state == NULL || d->reqid != state->reqid) {
+		/* traverse might have been terminated already */
+		return -1;
+	}
+
+	key.dsize = d->keylen;
+	key.dptr = &d->data[0];
+	data.dsize = d->datalen;
+	data.dptr = &d->data[d->keylen];
+
+	if (key.dsize == 0 && data.dsize == 0) {
+		state->null_count++;
+		/* Persistent databases are only scanned on one node (the local
+		 * node)
+		 */
+		if (ctdb_db_volatile(state->ctdb_db)) {
+			if (state->null_count != ctdb_get_num_active_nodes(ctdb)) {
+				return 0;
+			}
+		}
+	}
+
+	/* copy both out of state before calling, as the original did */
+	callback = state->callback;
+	private_data = state->private_data;
+
+	callback(private_data, key, data);
+	return 0;
+}
+
+/*
+  kill an in-progress traverse, used when a client disconnects.
+
+  returns -1 if the database is unknown, 0 otherwise (also when no
+  matching traverse was found)
+ */
+int32_t ctdb_control_traverse_kill(struct ctdb_context *ctdb, TDB_DATA data,
+				   TDB_DATA *outdata, uint32_t srcnode)
+{
+	struct ctdb_traverse_start *req = (struct ctdb_traverse_start *)data.dptr;
+	struct ctdb_db_context *ctdb_db = find_ctdb_db(ctdb, req->db_id);
+	struct ctdb_traverse_local_handle *t;
+
+	if (ctdb_db == NULL) {
+		return -1;
+	}
+
+	/* free the first local traverse that matches reqid and srvid */
+	t = ctdb_db->traverse;
+	while (t != NULL) {
+		if (t->client_reqid == req->reqid && t->srvid == req->srvid) {
+			talloc_free(t);
+			break;
+		}
+		t = t->next;
+	}
+
+	return 0;
+}
+
+
+/*
+  destructor for a traverse_start_state, run when a client disconnects
+  during a traverse: all the nodes taking part in the search are told
+  to kill their traverse children
+ */
+static int ctdb_traverse_start_destructor(struct traverse_start_state *state)
+{
+	struct ctdb_traverse_start kill_req;
+	TDB_DATA payload;
+
+	DEBUG(DEBUG_ERR,(__location__ " Traverse cancelled by client disconnect for database:0x%08x\n", state->db_id));
+
+	kill_req.srvid = state->srvid;
+	kill_req.reqid = state->reqid;
+	kill_req.db_id = state->db_id;
+
+	payload.dsize = sizeof(kill_req);
+	payload.dptr = (uint8_t *)&kill_req;
+
+	/* no reply is requested and the send result is not checked */
+	ctdb_daemon_send_control(state->ctdb, CTDB_BROADCAST_CONNECTED, 0,
+				 CTDB_CONTROL_TRAVERSE_KILL,
+				 0, CTDB_CTRL_FLAG_NOREPLY, payload, NULL, NULL);
+	return 0;
+}
+
+/*
+  callback which sends records as messages to the client.
+
+  p is the traverse_start_state of the request.  Each key/data pair is
+  marshalled and dispatched to the client's srvid.  An empty key with
+  empty data marks the end of the traverse: the state is then freed,
+  with its destructor (which broadcasts TRAVERSE_KILL) left armed only
+  if the traverse timed out.
+ */
+static void traverse_start_callback(void *p, TDB_DATA key, TDB_DATA data)
+{
+	struct traverse_start_state *state;
+	struct ctdb_rec_data_old *d;
+	TDB_DATA cdata;
+
+	state = talloc_get_type(p, struct traverse_start_state);
+
+	d = ctdb_marshall_record(state, state->reqid, key, NULL, data);
+	if (d == NULL) {
+		return;
+	}
+
+	cdata.dptr = (uint8_t *)d;
+	cdata.dsize = d->length;
+
+	srvid_dispatch(state->ctdb->srv, state->srvid, 0, cdata);
+
+	/*
+	 * Release the marshalled copy right away instead of letting every
+	 * record pile up under state until the traverse ends.
+	 * NOTE(review): relies on srvid handlers not keeping a pointer to
+	 * cdata after srvid_dispatch() returns - confirm.
+	 */
+	talloc_free(d);
+
+	if (key.dsize == 0 && data.dsize == 0) {
+		DEBUG(DEBUG_NOTICE, ("Ending traverse on DB %s (id %d), records %d\n",
+				     state->h->ctdb_db->db_name, state->h->reqid,
+				     state->num_records));
+
+		if (!state->h->timedout) {
+			/* end of traverse: nothing needs to be killed */
+			talloc_set_destructor(state, NULL);
+		}
+		/* timed out: the destructor sends the TRAVERSE_KILL control */
+		talloc_free(state);
+	} else {
+		state->num_records++;
+	}
+}
+
+
+/**
+ * start a traverse_all - called as a control from a client.
+ * extended version to take the "withemptyrecords" parameter.
+ *
+ * The per-traverse state is allocated as a talloc child of the client,
+ * and a destructor is attached once the traverse has been started.
+ *
+ * Returns 0 when the traverse was started, -1 on an unknown client,
+ * wrong request size, unknown or unhealthy database, or setup failure.
+ */
+int32_t ctdb_control_traverse_start_ext(struct ctdb_context *ctdb,
+					TDB_DATA data,
+					TDB_DATA *outdata,
+					uint32_t srcnode,
+					uint32_t client_id)
+{
+	struct ctdb_traverse_start_ext *req;
+	struct ctdb_client *client;
+	struct ctdb_db_context *ctdb_db;
+	struct traverse_start_state *state;
+
+	client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " No client found\n"));
+		return -1;
+	}
+
+	if (data.dsize != sizeof(struct ctdb_traverse_start_ext)) {
+		DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_start\n"));
+		return -1;
+	}
+	req = (struct ctdb_traverse_start_ext *)data.dptr;
+
+	ctdb_db = find_ctdb_db(ctdb, req->db_id);
+	if (ctdb_db == NULL) {
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason != NULL) {
+		/* reading an unhealthy db is only allowed when the tunable says so */
+		if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+			DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_start: %s\n",
+					 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+			return -1;
+		}
+		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_start: %s\n",
+				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
+	state = talloc(client, struct traverse_start_state);
+	if (state == NULL) {
+		return -1;
+	}
+
+	state->ctdb = ctdb;
+	state->srcnode = srcnode;
+	state->db_id = req->db_id;
+	state->reqid = req->reqid;
+	state->srvid = req->srvid;
+	state->withemptyrecords = req->withemptyrecords;
+	state->num_records = 0;
+
+	state->h = ctdb_daemon_traverse_all(ctdb_db, traverse_start_callback, state);
+	if (state->h == NULL) {
+		talloc_free(state);
+		return -1;
+	}
+
+	/* armed only now, so the failure path above frees state silently */
+	talloc_set_destructor(state, ctdb_traverse_start_destructor);
+
+	return 0;
+}
+
+/**
+ * start a traverse_all - called as a control from a client.
+ *
+ * Repackages the request as a struct ctdb_traverse_start_ext with
+ * withemptyrecords set to false and hands it to
+ * ctdb_control_traverse_start_ext, whose result is returned.
+ */
+int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb,
+				    TDB_DATA data,
+				    TDB_DATA *outdata,
+				    uint32_t srcnode,
+				    uint32_t client_id)
+{
+	struct ctdb_traverse_start *old_req = (struct ctdb_traverse_start *)data.dptr;
+	struct ctdb_traverse_start_ext ext_req;
+	TDB_DATA ext_data;
+
+	ZERO_STRUCT(ext_req);
+	ext_req.db_id = old_req->db_id;
+	ext_req.reqid = old_req->reqid;
+	ext_req.srvid = old_req->srvid;
+	ext_req.withemptyrecords = false;
+
+	ext_data.dptr = (uint8_t *)&ext_req;
+	ext_data.dsize = sizeof(ext_req);
+
+	return ctdb_control_traverse_start_ext(ctdb, ext_data, outdata, srcnode, client_id);
+}
diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c
new file mode 100644
index 0000000..0dce656
--- /dev/null
+++ b/ctdb/server/ctdb_tunables.c
@@ -0,0 +1,170 @@
+/*
+ ctdb tunables code
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tdb.h>
+
+#include "lib/util/debug.h"
+
+#include "ctdb_private.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/path.h"
+#include "common/tunable.h"
+
+/*
+ set all tunables to defaults
+
+ thin wrapper: hands ctdb->tunable to ctdb_tunable_set_defaults()
+ */
+void ctdb_tunables_set_defaults(struct ctdb_context *ctdb)
+{
+ ctdb_tunable_set_defaults(&ctdb->tunable);
+}
+
+
+/*
+  get a tunable
+
+  indata holds a struct ctdb_control_get_tunable whose name field is
+  req->length bytes long.  On success outdata receives a newly
+  allocated uint32_t with the value.
+
+  returns 0 on success, -EINVAL if the lookup of the name fails, and
+  -1 on malformed input or allocation failure
+ */
+int32_t ctdb_control_get_tunable(struct ctdb_context *ctdb, TDB_DATA indata,
+				 TDB_DATA *outdata)
+{
+	struct ctdb_control_get_tunable *req =
+		(struct ctdb_control_get_tunable *)indata.dptr;
+	uint32_t *result;
+	uint32_t value;
+	char *name;
+	bool found;
+
+	if (indata.dsize < sizeof(*req) ||
+	    req->length > indata.dsize - offsetof(struct ctdb_control_get_tunable, name)) {
+		DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_get_tunable\n"));
+		return -1;
+	}
+
+	/* private, NUL-terminated copy of the name for the lookup */
+	name = talloc_strndup(ctdb, (char*)req->name, req->length);
+	CTDB_NO_MEMORY(ctdb, name);
+
+	found = ctdb_tunable_get_value(&ctdb->tunable, name, &value);
+	talloc_free(name);
+	if (!found) {
+		return -EINVAL;
+	}
+
+	result = talloc(outdata, uint32_t);
+	outdata->dptr = (uint8_t *)result;
+	CTDB_NO_MEMORY(ctdb, outdata->dptr);
+
+	*result = value;
+	outdata->dsize = sizeof(uint32_t);
+
+	return 0;
+}
+
+
+/*
+  set a tunable
+
+  indata holds a struct ctdb_tunable_old carrying the name (length
+  bytes) and the new value.
+
+  returns 0 on success, 1 when the value was set but the tunable is
+  reported as obsolete, and -1 on malformed input, allocation failure
+  or when setting the value fails
+ */
+int32_t ctdb_control_set_tunable(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_tunable_old *req =
+		(struct ctdb_tunable_old *)indata.dptr;
+	bool obsolete;
+	int32_t status;
+	char *name;
+	int ret;
+
+	if (indata.dsize < sizeof(*req) ||
+	    req->length > indata.dsize - offsetof(struct ctdb_tunable_old, name)) {
+		DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tunable\n"));
+		return -1;
+	}
+
+	name = talloc_strndup(ctdb, (char *)req->name, req->length);
+	CTDB_NO_MEMORY(ctdb, name);
+
+	ret = ctdb_tunable_set_value(&ctdb->tunable, name, req->value,
+				     &obsolete);
+	if (! ret) {
+		status = -1;
+	} else if (obsolete) {
+		DEBUG(DEBUG_WARNING,
+		      ("Setting obsolete tunable \"%s\"\n", name));
+		status = 1;
+	} else {
+		status = 0;
+	}
+
+	/* name is still needed for the warning above, so free it last */
+	talloc_free(name);
+	return status;
+}
+
+/*
+  list tunables
+
+  outdata receives a struct ctdb_control_list_tunable whose data field
+  holds the string produced by ctdb_tunable_names_to_string(),
+  including its terminating NUL.
+
+  returns 0 on success, -1 on allocation failure
+ */
+int32_t ctdb_control_list_tunables(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+	struct ctdb_control_list_tunable *reply;
+	size_t list_len;
+	char *list;
+
+	list = ctdb_tunable_names_to_string(outdata);
+	CTDB_NO_MEMORY(ctdb, list);
+
+	/* the terminating NUL is part of the payload */
+	list_len = strlen(list) + 1;
+
+	outdata->dsize = offsetof(struct ctdb_control_list_tunable, data) +
+		list_len;
+	outdata->dptr = talloc_size(outdata, outdata->dsize);
+	CTDB_NO_MEMORY(ctdb, outdata->dptr);
+
+	reply = (struct ctdb_control_list_tunable *)outdata->dptr;
+	reply->length = list_len;
+	memcpy(reply->data, list, reply->length);
+
+	talloc_free(list);
+	return 0;
+}
+
+/*
+  load tunables from the "ctdb.tunables" file in the etc directory
+  into ctdb->tunable
+
+  returns the result of ctdb_tunable_load_file(), or false if the
+  temporary context or the file path could not be created
+ */
+bool ctdb_tunables_load(struct ctdb_context *ctdb)
+{
+	TALLOC_CTX *tmp_ctx;
+	char *file;
+	bool status;
+
+	tmp_ctx = talloc_new(ctdb);
+	if (tmp_ctx == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return false;
+	}
+
+	file = path_etcdir_append(tmp_ctx, "ctdb.tunables");
+	if (file == NULL) {
+		D_ERR("Failed to construct path for ctdb.tunables\n");
+		status = false;
+	} else {
+		/* no logging on failure here: the error is already logged */
+		status = ctdb_tunable_load_file(tmp_ctx, &ctdb->tunable, file);
+	}
+
+	talloc_free(tmp_ctx);
+	return status;
+}
diff --git a/ctdb/server/ctdb_tunnel.c b/ctdb/server/ctdb_tunnel.c
new file mode 100644
index 0000000..2df9474
--- /dev/null
+++ b/ctdb/server/ctdb_tunnel.c
@@ -0,0 +1,141 @@
+/*
+ ctdb_tunnel protocol code
+
+ Copyright (C) Amitay Isaacs 2017
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/util/debug.h"
+
+#include "common/logging.h"
+#include "common/reqid.h"
+#include "common/srvid.h"
+
+#include "ctdb_private.h"
+
+/*
+  register tunnel_id on behalf of the client identified by client_id
+
+  returns 0 on success, -1 if the client is unknown, the id is already
+  registered, or the registration fails
+ */
+int32_t ctdb_control_tunnel_register(struct ctdb_context *ctdb,
+				     uint32_t client_id, uint64_t tunnel_id)
+{
+	struct ctdb_client *client =
+		reqid_find(ctdb->idr, client_id, struct ctdb_client);
+
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR, ("Bad client_id in ctdb_tunnel_register\n"));
+		return -1;
+	}
+
+	/* a 0 result from srvid_exists() is treated as "already taken" */
+	if (srvid_exists(ctdb->tunnels, tunnel_id, NULL) == 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Tunnel id 0x%"PRIx64" already registered\n",
+		       tunnel_id));
+		return -1;
+	}
+
+	if (srvid_register(ctdb->tunnels, client, tunnel_id,
+			   daemon_tunnel_handler, client) != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to register tunnel id 0x%"PRIx64"\n",
+		       tunnel_id));
+		return -1;
+	}
+
+	DEBUG(DEBUG_INFO, ("Registered tunnel for id 0x%"PRIx64"\n",
+			   tunnel_id));
+	return 0;
+}
+
+/*
+  remove the registration of tunnel_id made by the client identified
+  by client_id
+
+  returns 0 on success, -1 if the client is unknown or the
+  deregistration fails
+ */
+int32_t ctdb_control_tunnel_deregister(struct ctdb_context *ctdb,
+				       uint32_t client_id, uint64_t tunnel_id)
+{
+	struct ctdb_client *client =
+		reqid_find(ctdb->idr, client_id, struct ctdb_client);
+
+	if (client == NULL) {
+		DEBUG(DEBUG_ERR, ("Bad client_id in ctdb_tunnel_deregister\n"));
+		return -1;
+	}
+
+	if (srvid_deregister(ctdb->tunnels, tunnel_id, client) != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to deregister tunnel id 0x%"PRIx64"\n",
+		       tunnel_id));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  build a CTDB_REQ_TUNNEL packet carrying data and queue it for
+  destnode
+
+  returns 0 once the packet has been queued, -1 if the transport is
+  down (ctdb->methods is NULL) or the packet cannot be allocated
+ */
+int ctdb_daemon_send_tunnel(struct ctdb_context *ctdb, uint32_t destnode,
+			    uint64_t tunnel_id, uint32_t flags, TDB_DATA data)
+{
+	struct ctdb_req_tunnel_old *pkt;
+	size_t pkt_len;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,
+		      ("Failed to send tunnel. Transport is DOWN\n"));
+		return -1;
+	}
+
+	/* fixed part of the request followed by the caller's payload */
+	pkt_len = offsetof(struct ctdb_req_tunnel_old, data) + data.dsize;
+	pkt = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_TUNNEL, pkt_len,
+				      struct ctdb_req_tunnel_old);
+	if (pkt == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Memory error in ctdb_daemon_send_tunnel()\n"));
+		return -1;
+	}
+
+	pkt->hdr.destnode = destnode;
+	pkt->tunnel_id = tunnel_id;
+	pkt->flags = flags;
+	pkt->datalen = data.dsize;
+	memcpy(pkt->data, data.dptr, data.dsize);
+
+	ctdb_queue_packet(ctdb, &pkt->hdr);
+
+	/* our copy is released as soon as the packet has been queued */
+	talloc_free(pkt);
+	return 0;
+}
+
+/*
+  handle an incoming tunnel request: the whole packet (hdr->length
+  bytes starting at the header) is dispatched to whatever is
+  registered for its tunnel id; a failed dispatch is only logged
+ */
+void ctdb_request_tunnel(struct ctdb_context *ctdb,
+			 struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_tunnel_old *req =
+		(struct ctdb_req_tunnel_old *)hdr;
+	TDB_DATA pkt = {
+		.dptr = (uint8_t *)req,
+		.dsize = hdr->length,
+	};
+
+	if (srvid_dispatch(ctdb->tunnels, req->tunnel_id, 0, pkt) != 0) {
+		DEBUG(DEBUG_ERR, ("Tunnel id 0x%"PRIx64" not registered\n",
+				  req->tunnel_id));
+	}
+}
diff --git a/ctdb/server/ctdb_update_record.c b/ctdb/server/ctdb_update_record.c
new file mode 100644
index 0000000..405499c
--- /dev/null
+++ b/ctdb/server/ctdb_update_record.c
@@ -0,0 +1,372 @@
+/*
+ implementation of the update record control
+
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Ronnie Sahlberg 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+struct ctdb_persistent_write_state {
+ struct ctdb_db_context *ctdb_db; /* database the records are written to */
+ struct ctdb_marshall_buffer *m; /* marshalled records to store */
+ struct ctdb_req_control_old *c; /* control request that gets the reply */
+ uint32_t flags; /* 0 or UPDATE_FLAGS_REPLACE_ONLY */
+};
+
+/* don't create/update records that do not exist locally */
+#define UPDATE_FLAGS_REPLACE_ONLY 1
+
+/*
+ called from a child process to write the data
+
+ stores every record of state->m into the database inside a single
+ tdb transaction. With UPDATE_FLAGS_REPLACE_ONLY set, records whose
+ local fetch returns no data are skipped. A record is refused when the
+ stored RSN is not lower than the new one and the stored data differs.
+
+ returns 0 after a successful commit; on any failure the transaction
+ is cancelled (or never started/committed) and -1 is returned
+ */
+static int ctdb_persistent_store(struct ctdb_persistent_write_state *state)
+{
+ unsigned int i;
+ int ret;
+ struct ctdb_rec_data_old *rec = NULL;
+ struct ctdb_marshall_buffer *m = state->m;
+
+ ret = tdb_transaction_start(state->ctdb_db->ltdb->tdb);
+ if (ret == -1) {
+ DEBUG(DEBUG_ERR,("Failed to start transaction for db_id 0x%08x in ctdb_persistent_store\n",
+ state->ctdb_db->db_id));
+ return -1;
+ }
+
+ for (i=0;i<m->count;i++) {
+ struct ctdb_ltdb_header oldheader;
+ struct ctdb_ltdb_header header;
+ TDB_DATA key, data, olddata;
+ TALLOC_CTX *tmp_ctx = talloc_new(state); /* per-record scratch context, freed on every path below */
+
+ /* rec is both the cursor and the result: NULL asks for the first record */
+ rec = ctdb_marshall_loop_next(m, rec, NULL, &header, &key, &data);
+
+ if (rec == NULL) {
+ D_ERR("Failed to get next record %u for db_id 0x%08x "
+ "in ctdb_persistent_store\n",
+ i,
+ state->ctdb_db->db_id);
+ talloc_free(tmp_ctx);
+ goto failed;
+ }
+
+ /* we must check if the record exists or not because
+ ctdb_ltdb_fetch will unconditionally create a record
+ */
+ if (state->flags & UPDATE_FLAGS_REPLACE_ONLY) {
+ TDB_DATA trec;
+ trec = tdb_fetch(state->ctdb_db->ltdb->tdb, key);
+ if (trec.dsize == 0) {
+ /* nothing stored locally: skip this record */
+ talloc_free(tmp_ctx);
+ continue;
+ }
+ free(trec.dptr);
+ }
+
+ /* fetch the old header and data so the RSNs can be compared */
+ ret = ctdb_ltdb_fetch(state->ctdb_db, key, &oldheader, tmp_ctx, &olddata);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to fetch old record for db_id 0x%08x in ctdb_persistent_store\n",
+ state->ctdb_db->db_id));
+ talloc_free(tmp_ctx);
+ goto failed;
+ }
+
+ /* an old RSN that is not lower is only tolerated when the data is unchanged */
+ if (oldheader.rsn >= header.rsn &&
+ (olddata.dsize != data.dsize ||
+ memcmp(olddata.dptr, data.dptr, data.dsize) != 0)) {
+ DEBUG(DEBUG_CRIT,("existing header for db_id 0x%08x has larger RSN %llu than new RSN %llu in ctdb_persistent_store\n",
+ state->ctdb_db->db_id,
+ (unsigned long long)oldheader.rsn, (unsigned long long)header.rsn));
+ talloc_free(tmp_ctx);
+ goto failed;
+ }
+
+ talloc_free(tmp_ctx);
+
+ ret = ctdb_ltdb_store(state->ctdb_db, key, &header, data);
+ if (ret != 0) {
+ DEBUG(DEBUG_CRIT,("Failed to store record for db_id 0x%08x in ctdb_persistent_store\n",
+ state->ctdb_db->db_id));
+ goto failed;
+ }
+ }
+
+ ret = tdb_transaction_commit(state->ctdb_db->ltdb->tdb);
+ if (ret == -1) {
+ DEBUG(DEBUG_ERR,("Failed to commit transaction for db_id 0x%08x in ctdb_persistent_store\n",
+ state->ctdb_db->db_id));
+ return -1;
+ }
+
+ return 0;
+
+failed:
+ /* undo everything stored so far in this transaction */
+ tdb_transaction_cancel(state->ctdb_db->ltdb->tdb);
+ return -1;
+}
+
+
+/*
+  called when the child has completed the persistent write on our
+  behalf: the status is sent as the reply to the pending control and
+  the write state is released
+ */
+static void ctdb_persistent_write_callback(int status, void *private_data)
+{
+	struct ctdb_persistent_write_state *state;
+
+	state = talloc_get_type(private_data,
+				struct ctdb_persistent_write_state);
+
+	ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, status, NULL);
+	talloc_free(state);
+}
+
+/*
+  timer handler called if our lockwait child times out: the pending
+  control is answered with -1 and an error string, then the write
+  state is released
+ */
+static void ctdb_persistent_lock_timeout(struct tevent_context *ev,
+					 struct tevent_timer *te,
+					 struct timeval t, void *private_data)
+{
+	struct ctdb_persistent_write_state *state;
+
+	state = talloc_get_type(private_data,
+				struct ctdb_persistent_write_state);
+
+	ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, -1, "timeout in ctdb_persistent_lock");
+	talloc_free(state);
+}
+
+struct childwrite_handle {
+ struct ctdb_context *ctdb; /* daemon context, used for statistics and ctdb_kill() */
+ struct ctdb_db_context *ctdb_db; /* database the child writes to */
+ struct tevent_fd *fde; /* read event on fd[0] */
+ int fd[2]; /* pipe: child writes its status byte to fd[1], parent reads fd[0] */
+ pid_t child; /* pid returned by ctdb_fork() */
+ void *private_data; /* passed back to callback */
+ void (*callback)(int, void *); /* called with the status byte and private_data */
+ struct timeval start_time; /* set when the handle is created, used for the latency statistic */
+};
+
+/* talloc destructor for a childwrite_handle: drops the
+   pending_childwrite_calls statistic and sends SIGKILL to the child
+*/
+static int childwrite_destructor(struct childwrite_handle *h)
+{
+ CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
+ ctdb_kill(h->ctdb, h->child, SIGKILL);
+ return 0;
+}
+
+/* called when the child process has finished writing the record to the
+ database
+
+ fd event handler for the read end of the pipe: reads the one-byte
+ status written by the child, passes it to the stored callback and
+ then kills the child and frees the handle
+*/
+static void childwrite_handler(struct tevent_context *ev,
+ struct tevent_fd *fde,
+ uint16_t flags, void *private_data)
+{
+ struct childwrite_handle *h = talloc_get_type(private_data,
+ struct childwrite_handle);
+ /* copied out of h up front, before the callback runs */
+ void *p = h->private_data;
+ void (*callback)(int, void *) = h->callback;
+ pid_t child = h->child;
+ TALLOC_CTX *tmp_ctx = talloc_new(ev);
+ int ret;
+ char c;
+
+ CTDB_UPDATE_LATENCY(h->ctdb, h->ctdb_db, "persistent", childwrite_latency, h->start_time);
+ CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
+
+ /* the handle needs to go away when the context is gone - when
+ the handle goes away this implicitly closes the pipe, which
+ kills the child */
+ talloc_steal(tmp_ctx, h);
+
+ /* the statistic was decremented above and the child is killed
+ explicitly below, so the destructor must not run as well */
+ talloc_set_destructor(h, NULL);
+
+ ret = sys_read(h->fd[0], &c, 1);
+ if (ret < 1) {
+ DEBUG(DEBUG_ERR, (__location__ " Read returned %d. Childwrite failed\n", ret));
+ /* report a failed write when no status byte could be read */
+ c = 1;
+ }
+
+ callback(c, p);
+
+ ctdb_kill(h->ctdb, child, SIGKILL);
+ talloc_free(tmp_ctx);
+}
+
+/* this creates a child process which will take out a tdb transaction
+   and write the record to the database.
+
+   The child runs ctdb_persistent_store(state) and writes one status
+   byte (0 on success, 1 on failure) to a pipe; the parent watches the
+   read end with childwrite_handler, which hands that byte to callback
+   together with state.
+
+   The returned handle is a talloc child of state.  NULL is returned if
+   the handle, the pipe, the fork or the fd event cannot be set up; in
+   every such case pending_childwrite_calls is decremented exactly once.
+*/
+static struct childwrite_handle *ctdb_childwrite(
+				struct ctdb_db_context *ctdb_db,
+				void (*callback)(int, void *private_data),
+				struct ctdb_persistent_write_state *state)
+{
+	struct childwrite_handle *result;
+	int ret;
+	pid_t parent = getpid();
+
+	CTDB_INCREMENT_STAT(ctdb_db->ctdb, childwrite_calls);
+	CTDB_INCREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+
+	if (!(result = talloc_zero(state, struct childwrite_handle))) {
+		CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+		return NULL;
+	}
+
+	ret = pipe(result->fd);
+
+	if (ret != 0) {
+		talloc_free(result);
+		CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+		return NULL;
+	}
+
+	result->child = ctdb_fork(ctdb_db->ctdb);
+
+	if (result->child == (pid_t)-1) {
+		close(result->fd[0]);
+		close(result->fd[1]);
+		talloc_free(result);
+		CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+		return NULL;
+	}
+
+	result->callback = callback;
+	result->private_data = state;
+	result->ctdb = ctdb_db->ctdb;
+	result->ctdb_db = ctdb_db;
+
+	if (result->child == 0) {
+		/* child: do the write and report the outcome as one byte */
+		char c = 0;
+
+		close(result->fd[0]);
+		prctl_set_comment("ctdb_write_persistent");
+		ret = ctdb_persistent_store(state);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR, (__location__ " Failed to write persistent data\n"));
+			c = 1;
+		}
+
+		sys_write(result->fd[1], &c, 1);
+
+		ctdb_wait_for_process_to_exit(parent);
+		_exit(0);
+	}
+
+	/* parent: only the read end is kept */
+	close(result->fd[1]);
+	set_close_on_exec(result->fd[0]);
+
+	talloc_set_destructor(result, childwrite_destructor);
+
+	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for ctdb_childwrite\n", result->fd[0]));
+
+	result->fde = tevent_add_fd(ctdb_db->ctdb->ev, result, result->fd[0],
+				    TEVENT_FD_READ, childwrite_handler,
+				    (void *)result);
+	if (result->fde == NULL) {
+		/*
+		 * No fd event owns the pipe yet, so the read end has to be
+		 * closed by hand.  The destructor installed above already
+		 * decrements pending_childwrite_calls and kills the child
+		 * when the handle is freed, so the counter must not be
+		 * decremented a second time here.
+		 */
+		close(result->fd[0]);
+		talloc_free(result);
+		return NULL;
+	}
+	tevent_fd_set_auto_close(result->fde);
+
+	result->start_time = timeval_current();
+
+	return result;
+}
+
+/*
+  update a record on this node if the new record has a higher rsn than the
+  current record
+
+  The write is done by a child process (see ctdb_childwrite), so on
+  success *async_reply is set and the reply to c is sent later, either
+  by the write callback or by the timeout handler armed here.
+
+  returns 0 when the child write was set up, -1 when recovery is
+  active, the database is unknown or unhealthy, or setup fails
+ */
+int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
+				   struct ctdb_req_control_old *c, TDB_DATA recdata,
+				   bool *async_reply)
+{
+	struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;
+	struct ctdb_persistent_write_state *state;
+	struct ctdb_db_context *ctdb_db;
+
+	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+		DEBUG(DEBUG_INFO,("rejecting ctdb_control_update_record when recovery active\n"));
+		return -1;
+	}
+
+	ctdb_db = find_ctdb_db(ctdb, m->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,("Unknown database 0x%08x in ctdb_control_update_record\n", m->db_id));
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason != NULL) {
+		DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_update_record: %s\n",
+				 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		return -1;
+	}
+
+	state = talloc(ctdb, struct ctdb_persistent_write_state);
+	CTDB_NO_MEMORY(ctdb, state);
+
+	state->ctdb_db = ctdb_db;
+	state->c = c;
+	state->m = m;
+	/* for volatile databases only records that exist locally are replaced */
+	state->flags = ctdb_db_volatile(ctdb_db) ? UPDATE_FLAGS_REPLACE_ONLY : 0;
+
+	/* create a child process to take out a transaction and
+	   write the data.
+	*/
+	if (ctdb_childwrite(ctdb_db, ctdb_persistent_write_callback, state) == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to setup childwrite handler in ctdb_control_update_record\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	/* we need to wait for the replies */
+	*async_reply = true;
+
+	/* need to keep the control structure around */
+	talloc_steal(state, c);
+
+	/* but we won't wait forever */
+	tevent_add_timer(ctdb->ev, state,
+			 timeval_current_ofs(ctdb->tunable.control_timeout, 0),
+			 ctdb_persistent_lock_timeout, state);
+
+	return 0;
+}
+
diff --git a/ctdb/server/ctdb_uptime.c b/ctdb/server/ctdb_uptime.c
new file mode 100644
index 0000000..53025f5
--- /dev/null
+++ b/ctdb/server/ctdb_uptime.c
@@ -0,0 +1,55 @@
+/*
+ ctdb uptime code
+
+ Copyright (C) Ronnie Sahlberg 2008
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/syslog.h"
+#include "system/time.h"
+#include "system/filesys.h"
+#include "system/network.h"
+
+#include <talloc.h>
+
+#include "lib/util/debug.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+  returns the ctdb uptime
+
+  outdata receives a newly allocated struct ctdb_uptime holding the
+  current time together with the daemon start time and the start and
+  finish times of the last recovery.  returns 0, or -1 if the
+  allocation fails
+*/
+int32_t ctdb_control_uptime(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+	struct ctdb_uptime *uptime = talloc_zero(outdata, struct ctdb_uptime);
+
+	CTDB_NO_MEMORY(ctdb, uptime);
+
+	uptime->ctdbd_start_time = ctdb->ctdbd_start_time;
+	uptime->last_recovery_started = ctdb->last_recovery_started;
+	uptime->last_recovery_finished = ctdb->last_recovery_finished;
+	gettimeofday(&uptime->current_time, NULL);
+
+	outdata->dptr = (uint8_t *)uptime;
+	outdata->dsize = sizeof(struct ctdb_uptime);
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_vacuum.c b/ctdb/server/ctdb_vacuum.c
new file mode 100644
index 0000000..74d7215
--- /dev/null
+++ b/ctdb/server/ctdb_vacuum.c
@@ -0,0 +1,1990 @@
+/*
+ ctdb vacuuming events
+
+ Copyright (C) Ronnie Sahlberg 2009
+ Copyright (C) Michael Adam 2010-2013
+ Copyright (C) Stefan Metzmacher 2010-2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/time.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/util_process.h"
+
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "protocol/protocol_private.h"
+
+#include "common/rb_tree.h"
+#include "common/common.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_api.h"
+
+/*
+ * Timeout passed to the ctdb_ctrl_*() calls in this file: the point in
+ * time 10 seconds from now.
+ */
+#define TIMELIMIT() timeval_current_ofs(10, 0)
+
+/*
+ * Status recorded for a vacuuming child (see the status field of
+ * struct ctdb_vacuum_child_context).
+ * NOTE(review): exact meaning of each value is presumed from its name.
+ */
+enum vacuum_child_status { VACUUM_RUNNING, VACUUM_OK, VACUUM_ERROR, VACUUM_TIMEOUT};
+
+/*
+ * Bookkeeping for one vacuuming child process.
+ * NOTE(review): field meanings below are presumed from names and types;
+ * confirm against the code that forks and reaps the child.
+ */
+struct ctdb_vacuum_child_context {
+ /* vacuum handle this child belongs to */
+ struct ctdb_vacuum_handle *vacuum_handle;
+ /* fd child writes status to */
+ int fd[2];
+ /* pid of the child process */
+ pid_t child_pid;
+ /* current state/result of the child */
+ enum vacuum_child_status status;
+ /* presumably the time the child was started - TODO confirm */
+ struct timeval start_time;
+ bool scheduled;
+};
+
+/*
+ * Vacuuming state attached to one database.
+ * NOTE(review): fast_path_count and vacuum_interval are presumed to be
+ * the number of fast runs since the last full run and the interval
+ * between runs - confirm units and semantics against their users.
+ */
+struct ctdb_vacuum_handle {
+ struct ctdb_db_context *ctdb_db;
+ uint32_t fast_path_count;
+ uint32_t vacuum_interval;
+};
+
+
+/*
+ * State of one vacuuming run: the lists of records to possibly delete
+ * plus the statistics counters that are logged at the end of each phase.
+ * Created by ctdb_vacuum_init_vacuum_data().
+ */
+struct vacuum_data {
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db;
+ struct tdb_context *dest_db;
+ /* tree of struct delete_record_data, indexed by ctdb_hash() of the key */
+ trbt_tree_t *delete_list;
+ /* array of num_nodes marshall buffers, indexed by the record's lmaster */
+ struct ctdb_marshall_buffer **vacuum_fetch_list;
+ /* time at which this vacuum_data was initialized */
+ struct timeval start;
+ /* set when adding to a vacuum fetch list fails; checked after the db traverse */
+ bool traverse_error;
+ bool vacuum;
+ struct {
+ /* counters for the traverse of the in-memory delete_queue (fast path) */
+ struct {
+ uint32_t added_to_vacuum_fetch_list;
+ uint32_t added_to_delete_list;
+ uint32_t deleted;
+ uint32_t skipped;
+ uint32_t error;
+ uint32_t total;
+ } delete_queue;
+ /* counters for the full traverse of the database */
+ struct {
+ uint32_t scheduled;
+ uint32_t skipped;
+ uint32_t error;
+ uint32_t total;
+ } db_traverse;
+ /* counters for the processing of the delete_list */
+ struct {
+ uint32_t total;
+ uint32_t remote_error;
+ uint32_t local_error;
+ uint32_t deleted;
+ uint32_t skipped;
+ uint32_t left;
+ } delete_list;
+ struct {
+ uint32_t vacuumed;
+ uint32_t copied;
+ } repack;
+ } count;
+};
+
+/*
+ * this structure contains the information for one record to be deleted
+ *
+ * It is allocated with room for the key bytes directly behind the fixed
+ * part (offsetof(keydata) + key size), and key.dptr points at keydata.
+ */
+struct delete_record_data {
+ struct ctdb_context *ctdb;
+ struct ctdb_db_context *ctdb_db;
+ /* header as it was when the record was queued; its rsn is re-checked before deletion */
+ struct ctdb_ltdb_header hdr;
+ /* bumped once for every remote node that handed the record back from TRY_DELETE_RECORDS */
+ uint32_t remote_fail_count;
+ TDB_DATA key;
+ /* first byte of the inline key storage */
+ uint8_t keydata[1];
+};
+
+/*
+ * Accumulator used while marshalling the delete list into a single blob
+ * (see delete_marshall_traverse()).
+ */
+struct delete_records_list {
+ /* marshall buffer being built; replaced whenever it is reallocated */
+ struct ctdb_marshall_buffer *records;
+ struct vacuum_data *vdata;
+};
+
+/*
+ * One entry of the fetch queue: the key of a record to migrate.
+ * Same inline-key layout as struct delete_record_data
+ * (NOTE(review): allocation site is not in this part of the file - confirm).
+ */
+struct fetch_record_data {
+ TDB_DATA key;
+ uint8_t keydata[1];
+};
+
+/*
+ * Forward declaration, needed by vacuum_traverse().
+ * Callers treat a non-zero return value as failure.
+ */
+static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
+ const struct ctdb_ltdb_header *hdr,
+ TDB_DATA key);
+
+/**
+ * Copy key and header into a newly allocated struct delete_record_data
+ * and insert it into tree, indexed by the hash of the key.
+ *
+ * The entry is allocated with tree as talloc context and keeps its own
+ * copy of the key bytes inline, directly behind the fixed part of the
+ * structure.
+ *
+ * Returns 0 on success, -1 if the allocation failed.
+ */
+static int insert_delete_record_data_into_tree(struct ctdb_context *ctdb,
+					       struct ctdb_db_context *ctdb_db,
+					       trbt_tree_t *tree,
+					       const struct ctdb_ltdb_header *hdr,
+					       TDB_DATA key)
+{
+	const size_t alloc_len =
+		offsetof(struct delete_record_data, keydata) + key.dsize;
+	struct delete_record_data *entry;
+
+	entry = (struct delete_record_data *)talloc_size(tree, alloc_len);
+	if (entry == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		return -1;
+	}
+	/*
+	 * Give the raw chunk the type name that the traverse callbacks
+	 * ask for via talloc_get_type().
+	 */
+	talloc_set_name_const(entry, "struct delete_record_data");
+
+	entry->ctdb = ctdb;
+	entry->ctdb_db = ctdb_db;
+	entry->hdr = *hdr;
+	entry->remote_fail_count = 0;
+
+	/* private copy of the key, stored inline */
+	memcpy(entry->keydata, key.dptr, key.dsize);
+	entry->key.dptr = entry->keydata;
+	entry->key.dsize = key.dsize;
+
+	trbt_insert32(tree, ctdb_hash(&key), entry);
+
+	return 0;
+}
+
+/**
+ * Put a record on vdata->delete_list, indexed by the hash of its key.
+ *
+ * If the list already holds an entry with the same hash, the record is
+ * skipped (logged at INFO level) and 0 is returned. After a successful
+ * insertion count.delete_list.total is incremented.
+ *
+ * Returns 0 on success or skip, -1 if the insertion failed.
+ */
+static int add_record_to_delete_list(struct vacuum_data *vdata, TDB_DATA key,
+				     struct ctdb_ltdb_header *hdr)
+{
+	uint32_t key_hash = ctdb_hash(&key);
+
+	if (trbt_lookup32(vdata->delete_list, key_hash)) {
+		DEBUG(DEBUG_INFO, (__location__ " Hash collision when vacuuming, skipping this record.\n"));
+		return 0;
+	}
+
+	if (insert_delete_record_data_into_tree(vdata->ctdb,
+						vdata->ctdb_db,
+						vdata->delete_list,
+						hdr, key) != 0) {
+		return -1;
+	}
+
+	vdata->count.delete_list.total++;
+
+	return 0;
+}
+
+/**
+ * Add a record to the list of records to be sent
+ * to their lmaster with VACUUM_FETCH.
+ *
+ * Only the key is marshalled (no header, no data) into the buffer
+ * vdata->vacuum_fetch_list[lmaster].
+ *
+ * Returns 0 on success and -1 on failure. Failure means either that the
+ * lmaster computed for the key is not a valid index into the fetch
+ * lists, or that growing the marshall buffer failed; in the latter case
+ * vdata->traverse_error is set as well.
+ */
+static int add_record_to_vacuum_fetch_list(struct vacuum_data *vdata,
+					   TDB_DATA key)
+{
+	struct ctdb_context *ctdb = vdata->ctdb;
+	uint32_t lmaster;
+	struct ctdb_marshall_buffer *vfl;
+
+	lmaster = ctdb_lmaster(ctdb, &key);
+
+	/*
+	 * vacuum_fetch_list is allocated with ctdb->num_nodes slots, so
+	 * an lmaster outside that range must not be used as an index.
+	 */
+	if (lmaster >= ctdb->num_nodes) {
+		DEBUG(DEBUG_CRIT, (__location__
+				   " lmaster[%u] >= ctdb->num_nodes[%u] for key"
+				   " with hash[%u]!\n",
+				   (unsigned)lmaster,
+				   (unsigned)ctdb->num_nodes,
+				   (unsigned)ctdb_hash(&key)));
+		return -1;
+	}
+
+	vfl = vdata->vacuum_fetch_list[lmaster];
+
+	vfl = ctdb_marshall_add(ctdb, vfl, vfl->db_id, ctdb->pnn,
+				key, NULL, tdb_null);
+	if (vfl == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		vdata->traverse_error = true;
+		return -1;
+	}
+
+	/* the buffer may have been reallocated, so store it back */
+	vdata->vacuum_fetch_list[lmaster] = vfl;
+
+	return 0;
+}
+
+
+/*
+ * Forward declaration. The parameter list is that of a tevent timer
+ * handler (event context, timer, current time, private data).
+ */
+static void ctdb_vacuum_event(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data);
+
+/*
+ * tdb_parse_record() callback: fetch the ltdb header of an empty record.
+ *
+ * private_data points to the struct ctdb_ltdb_header to fill in. Only
+ * records that consist of exactly one header (i.e. carry no data) are
+ * accepted.
+ *
+ * Returns 0 on success, -1 if the record has any other size; in that
+ * case the header is left untouched.
+ */
+static int vacuum_record_parser(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+	struct ctdb_ltdb_header *header =
+		(struct ctdb_ltdb_header *)private_data;
+
+	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+		return -1;
+	}
+
+	/*
+	 * data.dptr is a plain byte buffer without any alignment
+	 * guarantee, so copy the bytes instead of dereferencing it
+	 * through a struct pointer (same approach as
+	 * fetch_record_parser()).
+	 */
+	memcpy(header, data.dptr, sizeof(*header));
+
+	return 0;
+}
+
+/*
+ * tdb traverse callback for gathering the records that can be deleted
+ * (used by the full vacuuming run).
+ *
+ * A record that consists of nothing but an ltdb header and whose
+ * dmaster is this node is handed to insert_record_into_delete_queue(),
+ * so that the subsequent fast vacuum pass over the delete_queue
+ * processes it. The db_traverse counters of the struct vacuum_data
+ * passed in private_data are updated for every record.
+ *
+ * Returns -1 only if the lmaster computed for the key is not a valid
+ * node number; otherwise 0, even when queueing the record failed.
+ */
+static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+			   void *private_data)
+{
+	struct vacuum_data *vdata = talloc_get_type(private_data,
+						    struct vacuum_data);
+	struct ctdb_context *ctdb = vdata->ctdb;
+	struct ctdb_ltdb_header *rec_hdr;
+	uint32_t lmaster_pnn;
+
+	vdata->count.db_traverse.total++;
+
+	lmaster_pnn = ctdb_lmaster(ctdb, &key);
+	if (lmaster_pnn >= ctdb->num_nodes) {
+		vdata->count.db_traverse.error++;
+		DEBUG(DEBUG_CRIT, (__location__
+				   " lmaster[%u] >= ctdb->num_nodes[%u] for key"
+				   " with hash[%u]!\n",
+				   (unsigned)lmaster_pnn,
+				   (unsigned)ctdb->num_nodes,
+				   (unsigned)ctdb_hash(&key)));
+		return -1;
+	}
+
+	/*
+	 * Skip records that carry data (not deleted) and records this
+	 * node is not dmaster for. The size test comes first so that
+	 * the header is only looked at when it is completely present.
+	 */
+	rec_hdr = (struct ctdb_ltdb_header *)data.dptr;
+	if (data.dsize != sizeof(struct ctdb_ltdb_header) ||
+	    rec_hdr->dmaster != ctdb->pnn) {
+		vdata->count.db_traverse.skipped++;
+		return 0;
+	}
+
+	if (insert_record_into_delete_queue(vdata->ctdb_db, rec_hdr, key) == 0) {
+		vdata->count.db_traverse.scheduled++;
+	} else {
+		vdata->count.db_traverse.error++;
+	}
+
+	return 0;
+}
+
+/*
+ * Tree traverse callback: marshall one entry of the delete list (key
+ * and header, no data) into the blob kept in the struct
+ * delete_records_list passed as param.
+ *
+ * Returns 0 on success, -1 if the record could not be marshalled.
+ */
+static int delete_marshall_traverse(void *param, void *data)
+{
+	struct delete_records_list *list =
+		talloc_get_type(param, struct delete_records_list);
+	struct delete_record_data *entry =
+		talloc_get_type(data, struct delete_record_data);
+	struct ctdb_marshall_buffer *grown;
+
+	/*
+	 * NOTE(review): the db_id is passed for both the third and the
+	 * fourth argument of ctdb_marshall_add() - confirm this is
+	 * intended.
+	 */
+	grown = ctdb_marshall_add(list, list->records,
+				  list->records->db_id,
+				  list->records->db_id,
+				  entry->key, &entry->hdr, tdb_null);
+	if (grown == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to marshall record\n"));
+		return -1;
+	}
+
+	/* the buffer may have moved while growing */
+	list->records = grown;
+
+	return 0;
+}
+
+/* State shared by one pass over the fetch queue. */
+struct fetch_queue_state {
+ struct ctdb_db_context *ctdb_db;
+ /*
+  * migration calls sent but not yet completed: incremented in
+  * fetch_queue_traverse(), decremented in fetch_record_migrate_callback()
+  */
+ int count;
+};
+
+/* Per-record state of one migration call sent by fetch_queue_traverse(). */
+struct fetch_record_migrate_state {
+ /* the pass this call belongs to */
+ struct fetch_queue_state *fetch_queue;
+ /* copy of the record key; dptr is allocated as talloc child of this state */
+ TDB_DATA key;
+};
+
+/*
+ * Completion callback of a migration call sent by fetch_queue_traverse().
+ *
+ * Decrements the in-flight counter of the fetch queue pass. If the call
+ * succeeded and the local copy of the record now consists of exactly an
+ * ltdb header, the record is passed to
+ * ctdb_local_schedule_for_deletion(). The per-call state is freed on
+ * every path.
+ */
+static void fetch_record_migrate_callback(struct ctdb_client_call_state *state)
+{
+ struct fetch_record_migrate_state *fetch = talloc_get_type_abort(
+ state->async.private_data, struct fetch_record_migrate_state);
+ struct fetch_queue_state *fetch_queue = fetch->fetch_queue;
+ struct ctdb_ltdb_header hdr;
+ struct ctdb_call call = { 0 };
+ int ret;
+
+ ret = ctdb_call_recv(state, &call);
+ /* the call is over, whether it succeeded or not */
+ fetch_queue->count--;
+ if (ret != 0) {
+ D_ERR("Failed to migrate record for vacuuming\n");
+ goto done;
+ }
+
+ /* do not block on the record: if the chain is busy, just give up */
+ ret = tdb_chainlock_nonblock(fetch_queue->ctdb_db->ltdb->tdb,
+ fetch->key);
+ if (ret != 0) {
+ goto done;
+ }
+
+ /* vacuum_record_parser() only accepts records without data */
+ ret = tdb_parse_record(fetch_queue->ctdb_db->ltdb->tdb,
+ fetch->key,
+ vacuum_record_parser,
+ &hdr);
+
+ tdb_chainunlock(fetch_queue->ctdb_db->ltdb->tdb, fetch->key);
+
+ if (ret != 0) {
+ goto done;
+ }
+
+ D_INFO("Vacuum Fetch record, key=%.*s\n",
+ (int)fetch->key.dsize,
+ fetch->key.dptr);
+
+ /* result deliberately ignored */
+ (void) ctdb_local_schedule_for_deletion(fetch_queue->ctdb_db,
+ &hdr,
+ fetch->key);
+
+done:
+ talloc_free(fetch);
+}
+
+/*
+ * tdb_parse_record() callback: copy the ltdb header found at the start
+ * of a record into the struct ctdb_ltdb_header that private_data points
+ * to. The record may carry data behind the header.
+ *
+ * Returns 0 on success, -1 if the record is too short to hold a header.
+ */
+static int fetch_record_parser(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+	struct ctdb_ltdb_header *out = (struct ctdb_ltdb_header *)private_data;
+
+	if (data.dsize >= sizeof(struct ctdb_ltdb_header)) {
+		memcpy(out, data.dptr, sizeof(*out));
+		return 0;
+	}
+
+	return -1;
+}
+
+/**
+ * traverse function for the traversal of the fetch_queue.
+ *
+ * Send a record migration request.
+ *
+ * param is the struct fetch_queue_state of this pass, data the
+ * struct fetch_record_data entry of the queue.
+ *
+ * The record is skipped if its chain cannot be locked without blocking,
+ * if its header cannot be read, or if this node already is its dmaster.
+ * Otherwise an asynchronous migration call is sent and the in-flight
+ * counter of the pass is incremented; completion is handled in
+ * fetch_record_migrate_callback().
+ *
+ * Always returns 0.
+ */
+static int fetch_queue_traverse(void *param, void *data)
+{
+ struct fetch_record_data *rd = talloc_get_type_abort(
+ data, struct fetch_record_data);
+ struct fetch_queue_state *fetch_queue =
+ (struct fetch_queue_state *)param;
+ struct ctdb_db_context *ctdb_db = fetch_queue->ctdb_db;
+ struct ctdb_client_call_state *state;
+ struct fetch_record_migrate_state *fetch;
+ struct ctdb_call call = { 0 };
+ struct ctdb_ltdb_header header;
+ int ret;
+
+ /* never block here: a busy record is silently left alone */
+ ret = tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, rd->key);
+ if (ret != 0) {
+ return 0;
+ }
+
+ ret = tdb_parse_record(ctdb_db->ltdb->tdb,
+ rd->key,
+ fetch_record_parser,
+ &header);
+
+ /* the lock is only needed for reading the header */
+ tdb_chainunlock(ctdb_db->ltdb->tdb, rd->key);
+
+ if (ret != 0) {
+ goto skipped;
+ }
+
+ if (header.dmaster == ctdb_db->ctdb->pnn) {
+ /* If the record is already migrated, skip */
+ goto skipped;
+ }
+
+ fetch = talloc_zero(ctdb_db, struct fetch_record_migrate_state);
+ if (fetch == NULL) {
+ D_ERR("Failed to setup fetch record migrate state\n");
+ return 0;
+ }
+
+ fetch->fetch_queue = fetch_queue;
+
+ /* keep a private copy of the key for the lifetime of the call */
+ fetch->key.dsize = rd->key.dsize;
+ fetch->key.dptr = talloc_memdup(fetch, rd->key.dptr, rd->key.dsize);
+ if (fetch->key.dptr == NULL) {
+ D_ERR("Memory error in fetch_queue_traverse\n");
+ talloc_free(fetch);
+ return 0;
+ }
+
+ call.call_id = CTDB_NULL_FUNC;
+ call.flags = CTDB_IMMEDIATE_MIGRATION |
+ CTDB_CALL_FLAG_VACUUM_MIGRATION;
+ call.key = fetch->key;
+
+ state = ctdb_call_send(ctdb_db, &call);
+ if (state == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to setup vacuum fetch call\n"));
+ talloc_free(fetch);
+ return 0;
+ }
+
+ state->async.fn = fetch_record_migrate_callback;
+ state->async.private_data = fetch;
+
+ /* one more call in flight; the callback decrements this again */
+ fetch_queue->count++;
+
+ return 0;
+
+skipped:
+ D_INFO("Skipped Fetch record, key=%.*s\n",
+ (int)rd->key.dsize,
+ rd->key.dptr);
+ return 0;
+}
+
+/**
+ * Traverse the fetch queue.
+ * Records are migrated to the local node and
+ * added to delete queue for further processing.
+ *
+ * The migration calls are sent asynchronously by
+ * fetch_queue_traverse(); this function returns only after all of them
+ * have completed.
+ */
+static void ctdb_process_fetch_queue(struct ctdb_db_context *ctdb_db)
+{
+	struct fetch_queue_state state = {
+		.ctdb_db = ctdb_db,
+		.count = 0,
+	};
+
+	if (trbt_traversearray32(ctdb_db->fetch_queue, 1,
+				 fetch_queue_traverse, &state) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Error traversing "
+		      "the fetch queue.\n"));
+	}
+
+	/*
+	 * Run the event loop until the completion callback has been
+	 * invoked for every call that was sent.
+	 */
+	while (state.count > 0) {
+		tevent_loop_once(ctdb_db->ctdb->ev);
+	}
+}
+
+/**
+ * traverse function for the traversal of the delete_queue,
+ * the fast-path vacuuming list.
+ *
+ * - If the record has been migrated off the node
+ * or has been revived (filled with data) on the node,
+ * then skip the record.
+ *
+ * - If the current node is the record's lmaster and it is
+ * a record that has never been migrated with data, then
+ * delete the record from the local tdb.
+ *
+ * - If the current node is the record's lmaster and it has
+ * been migrated with data, then schedule it for the normal
+ * vacuuming procedure (i.e. add it to the delete_list).
+ *
+ * - If the current node is NOT the record's lmaster then
+ * add it to the list of records that are to be sent to
+ * the lmaster with the VACUUM_FETCH message.
+ *
+ * param is the struct vacuum_data of the run, data the
+ * struct delete_record_data entry of the queue. Every record is
+ * accounted for in exactly one of the count.delete_queue counters
+ * (besides total). Always returns 0.
+ */
+static int delete_queue_traverse(void *param, void *data)
+{
+ struct delete_record_data *dd =
+ talloc_get_type(data, struct delete_record_data);
+ struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data);
+ struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+ struct ctdb_context *ctdb = ctdb_db->ctdb; /* or dd->ctdb ??? */
+ int res;
+ struct ctdb_ltdb_header header;
+ uint32_t lmaster;
+ uint32_t hash = ctdb_hash(&(dd->key));
+
+ vdata->count.delete_queue.total++;
+
+ /* a record that is currently locked is not waited for, but counted as error */
+ res = tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, dd->key);
+ if (res != 0) {
+ vdata->count.delete_queue.error++;
+ return 0;
+ }
+
+ /* fails if the record is gone or is not an empty (header only) record */
+ res = tdb_parse_record(ctdb_db->ltdb->tdb, dd->key,
+ vacuum_record_parser, &header);
+ if (res != 0) {
+ goto skipped;
+ }
+
+ if (header.dmaster != ctdb->pnn) {
+ /* The record has been migrated off the node. Skip. */
+ goto skipped;
+ }
+
+ if (header.rsn != dd->hdr.rsn) {
+ /*
+ * The record has been migrated off the node and back again.
+ * But not requeued for deletion. Skip it.
+ */
+ goto skipped;
+ }
+
+ /*
+ * We are dmaster, and the record has no data, and it has
+ * not been migrated after it has been queued for deletion.
+ *
+ * At this stage, the record could still have been revived locally
+ * and last been written with empty data. This can only be
+ * fixed with the addition of an active or delete flag. (TODO)
+ */
+
+ lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+ if (lmaster != ctdb->pnn) {
+ /* not our record to delete: hand it over to its lmaster */
+ res = add_record_to_vacuum_fetch_list(vdata, dd->key);
+
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " Error adding record to list "
+ "of records to send to lmaster.\n"));
+ vdata->count.delete_queue.error++;
+ } else {
+ vdata->count.delete_queue.added_to_vacuum_fetch_list++;
+ }
+ goto done;
+ }
+
+ /* use header->flags or dd->hdr.flags ?? */
+ if (dd->hdr.flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) {
+ /* other nodes may hold a copy: needs the full delete_list procedure */
+ res = add_record_to_delete_list(vdata, dd->key, &dd->hdr);
+
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " Error adding record to list "
+ "of records for deletion on lmaster.\n"));
+ vdata->count.delete_queue.error++;
+ } else {
+ vdata->count.delete_queue.added_to_delete_list++;
+ }
+ } else {
+ /* never migrated with data: delete the local copy right away */
+ res = tdb_delete(ctdb_db->ltdb->tdb, dd->key);
+
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " Error deleting record with key "
+ "hash [0x%08x] from local data base db[%s].\n",
+ hash, ctdb_db->db_name));
+ vdata->count.delete_queue.error++;
+ goto done;
+ }
+
+ DEBUG(DEBUG_DEBUG,
+ (__location__ " Deleted record with key hash "
+ "[0x%08x] from local data base db[%s].\n",
+ hash, ctdb_db->db_name));
+ vdata->count.delete_queue.deleted++;
+ }
+
+ goto done;
+
+skipped:
+ vdata->count.delete_queue.skipped++;
+
+done:
+ /* every path that took the chainlock ends up here */
+ tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+ return 0;
+}
+
+/**
+ * Delete the records that we are lmaster and dmaster for and
+ * that could be deleted on all other nodes via the TRY_DELETE_RECORDS
+ * control.
+ *
+ * Traverse callback for the delete_list: param is the struct
+ * vacuum_data of the run, data the struct delete_record_data entry.
+ *
+ * The entry is freed and count.delete_list.left is decremented on
+ * every path; in addition exactly one of the remote_error, local_error,
+ * skipped or deleted counters is incremented. Always returns 0.
+ */
+static int delete_record_traverse(void *param, void *data)
+{
+ struct delete_record_data *dd =
+ talloc_get_type(data, struct delete_record_data);
+ struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data);
+ struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ int res;
+ struct ctdb_ltdb_header header;
+ uint32_t lmaster;
+ uint32_t hash = ctdb_hash(&(dd->key));
+
+ /* at least one remote node could not delete its copy: keep ours */
+ if (dd->remote_fail_count > 0) {
+ vdata->count.delete_list.remote_error++;
+ vdata->count.delete_list.left--;
+ talloc_free(dd);
+ return 0;
+ }
+
+ /* blocking lock here, unlike the nonblocking one used for the delete_queue */
+ res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " Error getting chainlock on record with "
+ "key hash [0x%08x] on database db[%s].\n",
+ hash, ctdb_db->db_name));
+ vdata->count.delete_list.local_error++;
+ vdata->count.delete_list.left--;
+ talloc_free(dd);
+ return 0;
+ }
+
+ /*
+ * Verify that the record is still empty, its RSN has not
+ * changed and that we are still its lmaster and dmaster.
+ */
+
+ res = tdb_parse_record(ctdb_db->ltdb->tdb, dd->key,
+ vacuum_record_parser, &header);
+ if (res != 0) {
+ goto skip;
+ }
+
+ if (header.flags & CTDB_REC_RO_FLAGS) {
+ DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+ "on database db[%s] has read-only flags. "
+ "skipping.\n",
+ hash, ctdb_db->db_name));
+ goto skip;
+ }
+
+ if (header.dmaster != ctdb->pnn) {
+ DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+ "on database db[%s] has been migrated away. "
+ "skipping.\n",
+ hash, ctdb_db->db_name));
+ goto skip;
+ }
+
+ if (header.rsn != dd->hdr.rsn) {
+ /*
+ * The record has been migrated off the node and back again.
+ * But not requeued for deletion. Skip it.
+ */
+ DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+ "on database db[%s] seems to have been "
+ "migrated away and back again (with empty "
+ "data). skipping.\n",
+ hash, ctdb_db->db_name));
+ goto skip;
+ }
+
+ lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+ if (lmaster != ctdb->pnn) {
+ DEBUG(DEBUG_INFO, (__location__ ": not lmaster for record in "
+ "delete list (key hash [0x%08x], db[%s]). "
+ "Strange! skipping.\n",
+ hash, ctdb_db->db_name));
+ goto skip;
+ }
+
+ /* all checks passed: remove the local copy */
+ res = tdb_delete(ctdb_db->ltdb->tdb, dd->key);
+
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__ " Error deleting record with key hash "
+ "[0x%08x] from local data base db[%s].\n",
+ hash, ctdb_db->db_name));
+ vdata->count.delete_list.local_error++;
+ goto done;
+ }
+
+ DEBUG(DEBUG_DEBUG,
+ (__location__ " Deleted record with key hash [0x%08x] from "
+ "local data base db[%s].\n", hash, ctdb_db->db_name));
+
+ vdata->count.delete_list.deleted++;
+ goto done;
+
+skip:
+ vdata->count.delete_list.skipped++;
+
+done:
+ /* unlock before freeing dd, since dd owns the key used for the lock */
+ tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+ talloc_free(dd);
+ vdata->count.delete_list.left--;
+
+ return 0;
+}
+
+/**
+ * Traverse the delete_queue (fast-path vacuuming).
+ *
+ * delete_queue_traverse() handles each queued record: it is either
+ * deleted directly, or moved to the delete list or to one of the
+ * vacuum fetch lists in vdata for further processing.
+ *
+ * Afterwards the delete_queue counters are cross-checked against the
+ * total and, if any record was processed, logged at INFO level.
+ */
+static void ctdb_process_delete_queue(struct ctdb_db_context *ctdb_db,
+				      struct vacuum_data *vdata)
+{
+	uint32_t accounted;
+
+	if (trbt_traversearray32(ctdb_db->delete_queue, 1,
+				 delete_queue_traverse, vdata) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Error traversing "
+		      "the delete queue.\n"));
+	}
+
+	/* each record must have been counted in exactly one bucket */
+	accounted = vdata->count.delete_queue.added_to_vacuum_fetch_list
+		  + vdata->count.delete_queue.added_to_delete_list
+		  + vdata->count.delete_queue.error
+		  + vdata->count.delete_queue.skipped
+		  + vdata->count.delete_queue.deleted;
+
+	if (accounted != vdata->count.delete_queue.total) {
+		DEBUG(DEBUG_ERR, (__location__ " Inconsistency in fast vacuum "
+		      "counts for db[%s]: total[%u] != sum[%u]\n",
+		      ctdb_db->db_name,
+		      (unsigned)vdata->count.delete_queue.total,
+		      (unsigned)accounted));
+	}
+
+	if (vdata->count.delete_queue.total == 0) {
+		/* nothing was queued, nothing to report */
+		return;
+	}
+
+	DEBUG(DEBUG_INFO,
+	      (__location__
+	       " fast vacuuming delete_queue traverse statistics: "
+	       "db[%s] "
+	       "total[%u] "
+	       "del[%u] "
+	       "skp[%u] "
+	       "err[%u] "
+	       "adl[%u] "
+	       "avf[%u]\n",
+	       ctdb_db->db_name,
+	       (unsigned)vdata->count.delete_queue.total,
+	       (unsigned)vdata->count.delete_queue.deleted,
+	       (unsigned)vdata->count.delete_queue.skipped,
+	       (unsigned)vdata->count.delete_queue.error,
+	       (unsigned)vdata->count.delete_queue.added_to_delete_list,
+	       (unsigned)vdata->count.delete_queue.added_to_vacuum_fetch_list));
+}
+
+/**
+ * read-only traverse of the database, looking for records that
+ * might be able to be vacuumed.
+ *
+ * This is not done each time but only every tunable
+ * VacuumFastPathCount times.
+ *
+ * Candidates are queued by vacuum_traverse(). On a traverse error only
+ * an error is logged; otherwise the db_traverse statistics are logged
+ * if at least one record was seen.
+ */
+static void ctdb_vacuum_traverse_db(struct ctdb_db_context *ctdb_db,
+				    struct vacuum_data *vdata)
+{
+	int rc = tdb_traverse_read(ctdb_db->ltdb->tdb, vacuum_traverse, vdata);
+
+	if (rc == -1 || vdata->traverse_error) {
+		DEBUG(DEBUG_ERR, (__location__ " Traverse error in vacuuming "
+		      "'%s'\n", ctdb_db->db_name));
+		return;
+	}
+
+	if (vdata->count.db_traverse.total == 0) {
+		/* empty database, nothing to report */
+		return;
+	}
+
+	DEBUG(DEBUG_INFO,
+	      (__location__
+	       " full vacuuming db traverse statistics: "
+	       "db[%s] "
+	       "total[%u] "
+	       "skp[%u] "
+	       "err[%u] "
+	       "sched[%u]\n",
+	       ctdb_db->db_name,
+	       (unsigned)vdata->count.db_traverse.total,
+	       (unsigned)vdata->count.db_traverse.skipped,
+	       (unsigned)vdata->count.db_traverse.error,
+	       (unsigned)vdata->count.db_traverse.scheduled));
+}
+
+/**
+ * Process the vacuum fetch lists:
+ * For records for which we are not the lmaster, tell the lmaster to
+ * fetch the record.
+ *
+ * One VACUUM_FETCH control carrying the marshalled list is sent to
+ * every other node whose list is not empty. A failed control is
+ * logged and the remaining nodes are still processed.
+ */
+static void ctdb_process_vacuum_fetch_lists(struct ctdb_db_context *ctdb_db,
+					    struct vacuum_data *vdata)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	unsigned int n;
+
+	for (n = 0; n < ctdb->num_nodes; n++) {
+		struct ctdb_marshall_buffer *list = vdata->vacuum_fetch_list[n];
+		TDB_DATA blob;
+		int ret, res;
+
+		/* skip ourselves and nodes without any record to fetch */
+		if (ctdb->nodes[n]->pnn == ctdb->pnn || list->count == 0) {
+			continue;
+		}
+
+		DEBUG(DEBUG_INFO, ("Found %u records for lmaster %u in '%s'\n",
+				   list->count, ctdb->nodes[n]->pnn,
+				   ctdb_db->db_name));
+
+		blob = ctdb_marshall_finish(list);
+
+		ret = ctdb_control(ctdb, ctdb->nodes[n]->pnn, 0,
+				   CTDB_CONTROL_VACUUM_FETCH, 0,
+				   blob, NULL, NULL, &res, NULL, NULL);
+		if (ret != 0 || res != 0) {
+			DEBUG(DEBUG_ERR, ("Failed to send vacuum "
+					  "fetch control to node %u\n",
+					  ctdb->nodes[n]->pnn));
+		}
+	}
+}
+
+/**
+ * Process the delete list:
+ *
+ * This is the last step of vacuuming that consistently deletes
+ * those records that have been migrated with data and can hence
+ * not be deleted when leaving a node.
+ *
+ * In this step, the lmaster does the final deletion of those empty
+ * records that it is also dmaster for. It has usually received
+ * at least some of these records previously from the former dmasters
+ * with the vacuum fetch message.
+ *
+ * 1) Send the records to all active nodes with the TRY_DELETE_RECORDS
+ * control. The remote nodes delete their local copy.
+ * 2) The lmaster locally deletes its copies of all records that
+ * could successfully be deleted remotely in step #1.
+ *
+ * Nothing is done if the delete list is empty. If the node map cannot
+ * be obtained, an allocation fails, marshalling fails, or the control
+ * fails on any node, the function stops without deleting anything
+ * locally and without logging the statistics.
+ */
+static void ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
+ struct vacuum_data *vdata)
+{
+ int ret, i;
+ struct ctdb_context *ctdb = ctdb_db->ctdb;
+ struct delete_records_list *recs;
+ TDB_DATA indata;
+ struct ctdb_node_map_old *nodemap;
+ uint32_t *active_nodes;
+ int num_active_nodes;
+ TALLOC_CTX *tmp_ctx;
+ uint32_t sum;
+
+ if (vdata->count.delete_list.total == 0) {
+ return;
+ }
+
+ /* everything allocated below hangs off tmp_ctx and is freed at "done" */
+ tmp_ctx = talloc_new(vdata);
+ if (tmp_ctx == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+ return;
+ }
+
+ /* delete_record_traverse() counts this down once per record */
+ vdata->count.delete_list.left = vdata->count.delete_list.total;
+
+ /*
+ * get the list of currently active nodes
+ */
+
+ ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(),
+ CTDB_CURRENT_NODE,
+ tmp_ctx,
+ &nodemap);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
+ goto done;
+ }
+
+ active_nodes = list_of_active_nodes(ctdb, nodemap,
+ nodemap, /* talloc context */
+ false /* include self */);
+ /* the element count is derived from the size of the talloc chunk */
+ num_active_nodes = talloc_get_size(active_nodes)/sizeof(*active_nodes);
+
+ /*
+ * Now delete the records on all active nodes in a two-phase process:
+ * 1) tell all active remote nodes to delete their copy
+ * 2) if all remote nodes deleted their record copy, delete it locally
+ */
+
+ recs = talloc_zero(tmp_ctx, struct delete_records_list);
+ if (recs == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+ goto done;
+ }
+
+ /*
+ * Step 1:
+ * Send all records to all active nodes for deletion.
+ */
+
+ /*
+ * Create a marshall blob from the remaining list of records to delete.
+ */
+
+ /* start with an empty marshall buffer (header only, no records) */
+ recs->records = (struct ctdb_marshall_buffer *)
+ talloc_zero_size(recs,
+ offsetof(struct ctdb_marshall_buffer, data));
+ if (recs->records == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+ goto done;
+ }
+ recs->records->db_id = ctdb_db->db_id;
+
+ ret = trbt_traversearray32(vdata->delete_list, 1,
+ delete_marshall_traverse, recs);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Error traversing the "
+ "delete list for second marshalling.\n"));
+ goto done;
+ }
+
+ indata = ctdb_marshall_finish(recs->records);
+
+ for (i = 0; i < num_active_nodes; i++) {
+ struct ctdb_marshall_buffer *records;
+ struct ctdb_rec_data_old *rec;
+ int32_t res;
+ TDB_DATA outdata;
+
+ /*
+ * recs is handed in as the talloc context argument
+ * (NOTE(review): presumably outdata is allocated under it
+ * and thus released together with tmp_ctx - confirm).
+ */
+ ret = ctdb_control(ctdb, active_nodes[i], 0,
+ CTDB_CONTROL_TRY_DELETE_RECORDS, 0,
+ indata, recs, &outdata, &res,
+ NULL, NULL);
+ if (ret != 0 || res != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to delete records on "
+ "node %u: ret[%d] res[%d]\n",
+ active_nodes[i], ret, res));
+ goto done;
+ }
+
+ /*
+ * outdata contains the list of records coming back
+ * from the node: These are the records that the
+ * remote node could not delete. We remove these from
+ * the list to delete locally.
+ */
+ records = (struct ctdb_marshall_buffer *)outdata.dptr;
+ rec = (struct ctdb_rec_data_old *)&records->data[0];
+ while (records->count-- > 0) {
+ TDB_DATA reckey, recdata;
+ struct ctdb_ltdb_header *rechdr;
+ struct delete_record_data *dd;
+
+ /* the key is followed directly by the data (header first) */
+ reckey.dptr = &rec->data[0];
+ reckey.dsize = rec->keylen;
+ recdata.dptr = &rec->data[reckey.dsize];
+ recdata.dsize = rec->datalen;
+
+ if (recdata.dsize < sizeof(struct ctdb_ltdb_header)) {
+ DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+ goto done;
+ }
+ /* header and remaining data are split off here but not used further below */
+ rechdr = (struct ctdb_ltdb_header *)recdata.dptr;
+ recdata.dptr += sizeof(*rechdr);
+ recdata.dsize -= sizeof(*rechdr);
+
+ dd = (struct delete_record_data *)trbt_lookup32(
+ vdata->delete_list,
+ ctdb_hash(&reckey));
+ if (dd != NULL) {
+ /*
+ * The remote node could not delete the
+ * record. Since other remote nodes can
+ * also fail, we just mark the record.
+ */
+ dd->remote_fail_count++;
+ } else {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to "
+ "find record with hash 0x%08x coming "
+ "back from TRY_DELETE_RECORDS "
+ "control in delete list.\n",
+ ctdb_hash(&reckey)));
+ }
+
+ /* advance to the next marshalled record */
+ rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
+ }
+ }
+
+ /*
+ * Step 2:
+ * Delete the remaining records locally.
+ *
+ * These records have successfully been deleted on all
+ * active remote nodes.
+ */
+
+ ret = trbt_traversearray32(vdata->delete_list, 1,
+ delete_record_traverse, vdata);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Error traversing the "
+ "delete list for deletion.\n"));
+ }
+
+ if (vdata->count.delete_list.left != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Vaccum db[%s] error: "
+ "there are %u records left for deletion after "
+ "processing delete list\n",
+ ctdb_db->db_name,
+ (unsigned)vdata->count.delete_list.left));
+ }
+
+ /* consistency check: every record must show up in exactly one counter */
+ sum = vdata->count.delete_list.deleted
+ + vdata->count.delete_list.skipped
+ + vdata->count.delete_list.remote_error
+ + vdata->count.delete_list.local_error
+ + vdata->count.delete_list.left;
+
+ if (vdata->count.delete_list.total != sum) {
+ DEBUG(DEBUG_ERR, (__location__ " Inconsistency in vacuum "
+ "delete list counts for db[%s]: total[%u] != sum[%u]\n",
+ ctdb_db->db_name,
+ (unsigned)vdata->count.delete_list.total,
+ (unsigned)sum));
+ }
+
+ if (vdata->count.delete_list.total > 0) {
+ DEBUG(DEBUG_INFO,
+ (__location__
+ " vacuum delete list statistics: "
+ "db[%s] "
+ "total[%u] "
+ "del[%u] "
+ "skip[%u] "
+ "rem.err[%u] "
+ "loc.err[%u] "
+ "left[%u]\n",
+ ctdb_db->db_name,
+ (unsigned)vdata->count.delete_list.total,
+ (unsigned)vdata->count.delete_list.deleted,
+ (unsigned)vdata->count.delete_list.skipped,
+ (unsigned)vdata->count.delete_list.remote_error,
+ (unsigned)vdata->count.delete_list.local_error,
+ (unsigned)vdata->count.delete_list.left));
+ }
+
+done:
+ talloc_free(tmp_ctx);
+
+ return;
+}
+
+/**
+ * initialize the vacuum_data
+ *
+ * Allocates a zeroed struct vacuum_data under mem_ctx together with an
+ * empty delete list (tree) and one empty marshall buffer per node for
+ * the vacuum fetch lists; each buffer is tagged with the db_id of
+ * ctdb_db. All sub-allocations hang off the returned structure.
+ *
+ * Returns the new structure, or NULL if any allocation failed (nothing
+ * is left allocated in that case).
+ */
+static struct vacuum_data *ctdb_vacuum_init_vacuum_data(
+					struct ctdb_db_context *ctdb_db,
+					TALLOC_CTX *mem_ctx)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct vacuum_data *vdata;
+	unsigned int node;
+
+	/*
+	 * talloc_zero() hands back cleared memory, so every statistics
+	 * counter in vdata->count starts out as 0.
+	 */
+	vdata = talloc_zero(mem_ctx, struct vacuum_data);
+	if (vdata == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		return NULL;
+	}
+
+	vdata->ctdb = ctdb;
+	vdata->ctdb_db = ctdb_db;
+
+	vdata->delete_list = trbt_create(vdata, 0);
+	if (vdata->delete_list == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		goto fail;
+	}
+
+	vdata->start = timeval_current();
+
+	/* the list needs to be of length num_nodes */
+	vdata->vacuum_fetch_list = talloc_zero_array(vdata,
+						struct ctdb_marshall_buffer *,
+						ctdb->num_nodes);
+	if (vdata->vacuum_fetch_list == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		goto fail;
+	}
+
+	for (node = 0; node < ctdb->num_nodes; node++) {
+		struct ctdb_marshall_buffer *empty;
+
+		/* header-only marshall buffer, i.e. zero records */
+		empty = (struct ctdb_marshall_buffer *)
+			talloc_zero_size(vdata->vacuum_fetch_list,
+					 offsetof(struct ctdb_marshall_buffer, data));
+		if (empty == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+			goto fail;
+		}
+		empty->db_id = ctdb_db->db_id;
+		vdata->vacuum_fetch_list[node] = empty;
+	}
+
+	return vdata;
+
+fail:
+	/* frees the tree and the fetch lists as well */
+	talloc_free(vdata);
+	return NULL;
+}
+
+/**
+ * Vacuum a DB:
+ * - Always do the fast vacuuming run, which traverses
+ * - the in-memory fetch queue: these records have been
+ * scheduled for migration
+ * - the in-memory delete queue: these records have been
+ * scheduled for deletion.
+ * - Only if explicitly requested, the database is traversed
+ * in order to use the traditional heuristics on empty records
+ * to trigger deletion.
+ * This is done only every VacuumFastPathCount'th vacuuming run.
+ *
+ * The traverse runs fill two lists:
+ *
+ * - The delete_list:
+ * This is the list of empty records the current
+ * node is lmaster and dmaster for. These records are later
+ * deleted first on other nodes and then locally.
+ *
+ * The fast vacuuming run has a short cut for those records
+ * that have never been migrated with data: these records
+ * are immediately deleted locally, since they have left
+ * no trace on other nodes.
+ *
+ * - The vacuum_fetch lists
+ * (one for each other lmaster node):
+ * The records in this list are sent for deletion to
+ * their lmaster in a bulk VACUUM_FETCH control.
+ *
+ * The lmaster then migrates all these records to itself
+ * so that they can be vacuumed there.
+ *
+ * This executes in the child context.
+ *
+ * Returns 0 once the processing steps have run (they report their own
+ * problems via the log only), and non-zero if the vnnmap or the pnn
+ * could not be obtained or the vacuum data could not be set up.
+ */
+static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
+			  bool full_vacuum_run)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct vacuum_data *vdata;
+	TALLOC_CTX *frame;
+	int local_pnn;
+	int ret;
+
+	DEBUG(DEBUG_INFO, (__location__ " Entering %s vacuum run for db "
+			   "%s db_id[0x%08x]\n",
+			   full_vacuum_run ? "full" : "fast",
+			   ctdb_db->db_name, ctdb_db->db_id));
+
+	/* refresh the vnnmap and our pnn from the local node */
+	ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Unable to get vnnmap from local node\n"));
+		return ret;
+	}
+
+	local_pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
+	if (local_pnn == -1) {
+		DEBUG(DEBUG_ERR, ("Unable to get pnn from local node\n"));
+		return -1;
+	}
+	ctdb->pnn = local_pnn;
+
+	frame = talloc_new(ctdb_db);
+	if (frame == NULL) {
+		DEBUG(DEBUG_ERR, ("Out of memory!\n"));
+		return -1;
+	}
+
+	vdata = ctdb_vacuum_init_vacuum_data(ctdb_db, frame);
+	if (vdata == NULL) {
+		talloc_free(frame);
+		return -1;
+	}
+
+	/* the full traverse only feeds the delete queue processed below */
+	if (full_vacuum_run) {
+		ctdb_vacuum_traverse_db(ctdb_db, vdata);
+	}
+
+	ctdb_process_fetch_queue(ctdb_db);
+	ctdb_process_delete_queue(ctdb_db, vdata);
+	ctdb_process_vacuum_fetch_lists(ctdb_db, vdata);
+	ctdb_process_delete_list(ctdb_db, vdata);
+
+	/* releases vdata and everything attached to it */
+	talloc_free(frame);
+
+	return 0;
+}
+
+/*
+ * Vacuum a database and, if its freelist has grown large enough,
+ * repack it.
+ * Called from the child context.
+ *
+ * A failing vacuum run is only logged; the freelist check and the
+ * repack decision still go ahead afterwards.
+ *
+ * A repack is done only when the repack_limit tunable is non-zero and
+ * the freelist holds at least that many entries.
+ *
+ * Returns 0 on success (including when no repack is needed) and -1 if
+ * the freelist size cannot be read or the repack fails.
+ */
+static int ctdb_vacuum_and_repack_db(struct ctdb_db_context *ctdb_db,
+				     bool full_vacuum_run)
+{
+	uint32_t repack_limit = ctdb_db->ctdb->tunable.repack_limit;
+	const char *name = ctdb_db->db_name;
+	int freelist_size = 0;
+	int ret;
+
+	if (ctdb_vacuum_db(ctdb_db, full_vacuum_run) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to vacuum '%s'\n", name));
+	}
+
+	freelist_size = tdb_freelist_size(ctdb_db->ltdb->tdb);
+	if (freelist_size == -1) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to get freelist size for '%s'\n", name));
+		return -1;
+	}
+
+	/*
+	 * decide if a repack is necessary
+	 */
+	if ((repack_limit == 0 || (uint32_t)freelist_size < repack_limit))
+	{
+		return 0;
+	}
+
+	/* freelist_size is an int, so print it with %d rather than %u */
+	D_NOTICE("Repacking %s with %d freelist entries\n",
+		 name,
+		 freelist_size);
+
+	ret = tdb_repack(ctdb_db->ltdb->tdb);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to repack '%s'\n", name));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Return the currently configured vacuum_interval tunable for the
+ * ctdb instance that owns this database.
+ */
+static uint32_t get_vacuum_interval(struct ctdb_db_context *ctdb_db)
+{
+	return ctdb_db->ctdb->tunable.vacuum_interval;
+}
+
+/*
+ * talloc destructor for a vacuum child context.
+ *
+ * Records the run time of the vacuuming run, kills the child if it has
+ * not been marked as finished (child_pid != -1), clears ctdb->vacuumer
+ * and, for scheduled runs, arms the timer for the next vacuuming event.
+ *
+ * Always returns 0, so the context is always freed.
+ */
+static int vacuum_child_destructor(struct ctdb_vacuum_child_context *child_ctx)
+{
+	double l = timeval_elapsed(&child_ctx->start_time);
+	struct ctdb_vacuum_handle *handle = child_ctx->vacuum_handle;
+	struct ctdb_db_context *ctdb_db = handle->ctdb_db;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	bool child_finished = (child_ctx->child_pid == -1);
+
+	CTDB_UPDATE_DB_LATENCY(ctdb_db, "vacuum", vacuum.latency, l);
+	DEBUG(DEBUG_INFO,
+	      ("Vacuuming took %.3f seconds for database %s\n",
+	       l, ctdb_db->db_name));
+
+	if (child_finished) {
+		/* Bump the number of successful fast-path runs. */
+		handle->fast_path_count++;
+	} else {
+		ctdb_kill(ctdb, child_ctx->child_pid, SIGKILL);
+	}
+
+	ctdb->vacuumer = NULL;
+
+	if (!child_ctx->scheduled) {
+		return 0;
+	}
+
+	/* Pick up the current tunable value for the next scheduled run. */
+	handle->vacuum_interval = get_vacuum_interval(ctdb_db);
+	tevent_add_timer(ctdb->ev,
+			 handle,
+			 timeval_current_ofs(handle->vacuum_interval, 0),
+			 ctdb_vacuum_event,
+			 handle);
+
+	return 0;
+}
+
+/*
+ * Timer handler, fired when a vacuum child process has run for longer
+ * than allowed. Marks the run as timed out and frees the child
+ * context.
+ */
+static void vacuum_child_timeout(struct tevent_context *ev,
+				 struct tevent_timer *te,
+				 struct timeval t, void *private_data)
+{
+	struct ctdb_vacuum_child_context *child_ctx;
+
+	child_ctx = talloc_get_type(private_data,
+				    struct ctdb_vacuum_child_context);
+
+	DEBUG(DEBUG_ERR,
+	      ("Vacuuming child process timed out for db %s\n",
+	       child_ctx->vacuum_handle->ctdb_db->db_name));
+
+	child_ctx->status = VACUUM_TIMEOUT;
+
+	talloc_free(child_ctx);
+}
+
+
+/*
+ * fd handler, fired when the pipe from a vacuum child becomes
+ * readable, i.e. when the child has completed.
+ *
+ * The child is marked as finished (child_pid = -1) and its single
+ * status byte is read: exactly one byte with value 0 means success,
+ * anything else is recorded as an error. The child context is then
+ * freed.
+ */
+static void vacuum_child_handler(struct tevent_context *ev,
+				 struct tevent_fd *fde,
+				 uint16_t flags, void *private_data)
+{
+	struct ctdb_vacuum_child_context *child_ctx;
+	const char *db_name;
+	char c = 0;
+	int ret;
+
+	child_ctx = talloc_get_type(private_data,
+				    struct ctdb_vacuum_child_context);
+	db_name = child_ctx->vacuum_handle->ctdb_db->db_name;
+
+	DEBUG(DEBUG_INFO,
+	      ("Vacuuming child process %d finished for db %s\n",
+	       child_ctx->child_pid, db_name));
+	child_ctx->child_pid = -1;
+
+	ret = sys_read(child_ctx->fd[0], &c, 1);
+	if (ret == 1 && c == 0) {
+		child_ctx->status = VACUUM_OK;
+	} else {
+		child_ctx->status = VACUUM_ERROR;
+		DEBUG(DEBUG_ERR,
+		      ("A vacuum child process failed with an error for database %s. ret=%d c=%d\n",
+		       db_name, ret, c));
+	}
+
+	talloc_free(child_ctx);
+}
+
+/*
+ * Start a child process that vacuums (and, if needed, repacks) a
+ * database. Called every time a new vacuum process is needed.
+ *
+ * mem_ctx          talloc parent of the child context
+ * ctdb_db          database to vacuum
+ * scheduled        stored in the child context; when true, the child
+ *                  context destructor schedules the next vacuuming
+ *                  event
+ * full_vacuum_run  passed on to ctdb_vacuum_and_repack_db() in the
+ *                  child
+ * out              receives the child context on success
+ *
+ * Returns 0 on success, otherwise an errno-style value:
+ *   EAGAIN  recovery is active, the database is frozen, or
+ *           pipe()/fork failed
+ *   EBUSY   another vacuuming child is still active
+ *   ENOMEM  allocating the child context or its events failed
+ *
+ * The child writes a single status byte to the pipe when it is done;
+ * vacuum_child_handler() reads it in the parent.
+ */
+static int vacuum_db_child(TALLOC_CTX *mem_ctx,
+			   struct ctdb_db_context *ctdb_db,
+			   bool scheduled,
+			   bool full_vacuum_run,
+			   struct ctdb_vacuum_child_context **out)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_vacuum_child_context *child_ctx;
+	struct tevent_timer *te;
+	struct tevent_fd *fde;
+	int ret;
+
+	/* we don't vacuum if we are in recovery mode, or db frozen */
+	if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ||
+	    ctdb_db_frozen(ctdb_db)) {
+		D_INFO("Not vacuuming %s (%s)\n", ctdb_db->db_name,
+		       ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ?
+		       "in recovery" : "frozen");
+		return EAGAIN;
+	}
+
+	/* Do not allow multiple vacuuming child processes to be active at the
+	 * same time.  If there is vacuuming child process active, delay
+	 * new vacuuming event to stagger vacuuming events.
+	 */
+	if (ctdb->vacuumer != NULL) {
+		return EBUSY;
+	}
+
+	child_ctx = talloc_zero(mem_ctx, struct ctdb_vacuum_child_context);
+	if (child_ctx == NULL) {
+		DBG_ERR("Failed to allocate child context for vacuuming of %s\n",
+			ctdb_db->db_name);
+		return ENOMEM;
+	}
+
+	ret = pipe(child_ctx->fd);
+	if (ret != 0) {
+		talloc_free(child_ctx);
+		D_ERR("Failed to create pipe for vacuum child process.\n");
+		return EAGAIN;
+	}
+
+	child_ctx->child_pid = ctdb_fork(ctdb);
+	if (child_ctx->child_pid == (pid_t)-1) {
+		close(child_ctx->fd[0]);
+		close(child_ctx->fd[1]);
+		talloc_free(child_ctx);
+		D_ERR("Failed to fork vacuum child process.\n");
+		return EAGAIN;
+	}
+
+	if (child_ctx->child_pid == 0) {
+		char cc = 0;
+		close(child_ctx->fd[0]);
+
+		D_INFO("Vacuuming child process %d for db %s started\n",
+		       getpid(),
+		       ctdb_db->db_name);
+		prctl_set_comment("ctdb_vacuum");
+		ret = switch_from_server_to_client(ctdb);
+		if (ret != 0) {
+			DBG_ERR("ERROR: failed to switch vacuum daemon "
+				"into client mode.\n");
+			/*
+			 * This is the forked child: returning from here
+			 * would carry on running the caller's code in a
+			 * second process.  Exit instead, without writing
+			 * a status byte; the parent then fails to read
+			 * one and records an error.
+			 */
+			_exit(1);
+		}
+
+		cc = ctdb_vacuum_and_repack_db(ctdb_db, full_vacuum_run);
+
+		sys_write(child_ctx->fd[1], &cc, 1);
+		_exit(0);
+	}
+
+	set_close_on_exec(child_ctx->fd[0]);
+	close(child_ctx->fd[1]);
+
+	child_ctx->status = VACUUM_RUNNING;
+	child_ctx->scheduled = scheduled;
+	child_ctx->start_time = timeval_current();
+	/*
+	 * vacuum_child_destructor() dereferences vacuum_handle, so it
+	 * must be set before the destructor is armed.
+	 */
+	child_ctx->vacuum_handle = ctdb_db->vacuum_handle;
+
+	ctdb->vacuumer = child_ctx;
+	talloc_set_destructor(child_ctx, vacuum_child_destructor);
+
+	/*
+	 * Clear the fastpath vacuuming list in the parent.
+	 *
+	 * Failing to re-create either queue is fatal: the old queue is
+	 * already gone, and other functions in this file pass
+	 * ctdb_db->delete_queue / fetch_queue to the tree API without
+	 * checking it first.
+	 */
+	talloc_free(ctdb_db->delete_queue);
+	ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+	if (ctdb_db->delete_queue == NULL) {
+		ctdb_fatal(ctdb, "Out of memory when re-creating delete queue "
+				 "in parent context. Shutting down\n");
+	}
+
+	talloc_free(ctdb_db->fetch_queue);
+	ctdb_db->fetch_queue = trbt_create(ctdb_db, 0);
+	if (ctdb_db->fetch_queue == NULL) {
+		ctdb_fatal(ctdb, "Out of memory when re-create fetch queue "
+				 " in parent context. Shutting down\n");
+	}
+
+	/* Give up on the child if it runs for too long. */
+	te = tevent_add_timer(ctdb->ev, child_ctx,
+			      timeval_current_ofs(
+				      ctdb->tunable.vacuum_max_run_time, 0),
+			      vacuum_child_timeout, child_ctx);
+	if (te == NULL) {
+		goto fail_events;
+	}
+
+	DBG_DEBUG(" Created PIPE FD:%d to child vacuum process\n",
+		  child_ctx->fd[0]);
+
+	fde = tevent_add_fd(ctdb->ev, child_ctx, child_ctx->fd[0],
+			    TEVENT_FD_READ, vacuum_child_handler, child_ctx);
+	if (fde == NULL) {
+		goto fail_events;
+	}
+	tevent_fd_set_auto_close(fde);
+
+	*out = child_ctx;
+	return 0;
+
+fail_events:
+	DBG_ERR("Failed to set up events for vacuum child of %s\n",
+		ctdb_db->db_name);
+	/* No fd event owns the pipe yet, so close it by hand. */
+	close(child_ctx->fd[0]);
+	/*
+	 * The caller handles rescheduling after a failure, so stop the
+	 * destructor from arming a second timer.  The destructor still
+	 * kills the child and clears ctdb->vacuumer.
+	 */
+	child_ctx->scheduled = false;
+	talloc_free(child_ctx);
+	return ENOMEM;
+}
+
+/*
+ * Timer handler for the periodic vacuuming of one database.
+ *
+ * If the vacuum_interval tunable has grown since this event was
+ * scheduled, the event is pushed back by the difference and nothing
+ * else happens. Otherwise a vacuuming child is started; every time
+ * fast_path_count has reached the vacuum_fast_path_count tunable the
+ * counter is reset and, if that tunable is non-zero, a full run is
+ * requested.
+ *
+ * On success nothing more is scheduled here (the child context
+ * destructor does that for scheduled runs). If the child could not be
+ * started, the event is retried after half a second for EBUSY and
+ * after the regular vacuum interval for any other failure.
+ */
+static void ctdb_vacuum_event(struct tevent_context *ev,
+			      struct tevent_timer *te,
+			      struct timeval t, void *private_data)
+{
+	struct ctdb_vacuum_handle *vacuum_handle = talloc_get_type(
+		private_data, struct ctdb_vacuum_handle);
+	struct ctdb_db_context *ctdb_db = vacuum_handle->ctdb_db;
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	struct ctdb_vacuum_child_context *child_ctx = NULL;
+	uint32_t fast_path_max = ctdb->tunable.vacuum_fast_path_count;
+	uint32_t configured = get_vacuum_interval(ctdb_db);
+	bool full_vacuum_run = false;
+	struct timeval retry_at;
+	int ret;
+
+	if (configured > vacuum_handle->vacuum_interval) {
+		uint32_t delta = configured - vacuum_handle->vacuum_interval;
+
+		DBG_INFO("Vacuum interval increased from "
+			 "%"PRIu32" to %"PRIu32", rescheduling\n",
+			 vacuum_handle->vacuum_interval,
+			 configured);
+		vacuum_handle->vacuum_interval = configured;
+		tevent_add_timer(ctdb->ev,
+				 vacuum_handle,
+				 timeval_current_ofs(delta, 0),
+				 ctdb_vacuum_event,
+				 vacuum_handle);
+		return;
+	}
+
+	vacuum_handle->vacuum_interval = configured;
+
+	if (vacuum_handle->fast_path_count >= fast_path_max) {
+		full_vacuum_run = (fast_path_max > 0);
+		vacuum_handle->fast_path_count = 0;
+	}
+
+	ret = vacuum_db_child(vacuum_handle,
+			      ctdb_db,
+			      true,
+			      full_vacuum_run,
+			      &child_ctx);
+	if (ret == 0) {
+		return;
+	}
+
+	if (ret == EBUSY) {
+		/* Stagger */
+		retry_at = timeval_current_ofs(0, 500*1000);
+	} else {
+		/* Temporary failure, schedule next attempt */
+		retry_at = timeval_current_ofs(vacuum_handle->vacuum_interval,
+					       0);
+	}
+
+	tevent_add_timer(ctdb->ev,
+			 vacuum_handle,
+			 retry_at,
+			 ctdb_vacuum_event,
+			 vacuum_handle);
+}
+
+struct vacuum_control_state { /* pending reply to a DB vacuum control */
+ struct ctdb_vacuum_child_context *child_ctx; /* vacuum run whose status decides the reply */
+ struct ctdb_req_control_old *c; /* control request to answer; talloc-stolen onto this state */
+ struct ctdb_context *ctdb; /* context used when sending the reply */
+};
+
+/*
+ * talloc destructor for a vacuum control state: sends the reply to
+ * the pending control. The reply status is 0 if the vacuum child
+ * finished with VACUUM_OK and -1 otherwise.
+ */
+static int vacuum_control_state_destructor(struct vacuum_control_state *state)
+{
+	int32_t status = -1;
+
+	if (state->child_ctx->status == VACUUM_OK) {
+		status = 0;
+	}
+
+	ctdb_request_control_reply(state->ctdb, state->c, NULL, status, NULL);
+
+	return 0;
+}
+
+/*
+ * Control handler: start an unscheduled vacuuming run for a database.
+ *
+ * indata carries a marshalled struct ctdb_db_vacuum (database id and
+ * whether a full run is wanted). On success the reply is deferred:
+ * *async_reply is set to true and the reply is sent from
+ * vacuum_control_state_destructor() when the vacuum child context is
+ * freed.
+ *
+ * Returns 0 if the vacuuming child was started, -1 on invalid data,
+ * unknown database, a database without vacuuming set up, allocation
+ * failure or failure to start the child.
+ */
+int32_t ctdb_control_db_vacuum(struct ctdb_context *ctdb,
+			       struct ctdb_req_control_old *c,
+			       TDB_DATA indata,
+			       bool *async_reply)
+{
+	struct ctdb_db_context *ctdb_db;
+	struct ctdb_vacuum_child_context *child_ctx = NULL;
+	struct ctdb_db_vacuum *db_vacuum;
+	struct vacuum_control_state *state;
+	size_t np;
+	int ret;
+
+	ret = ctdb_db_vacuum_pull(indata.dptr,
+				  indata.dsize,
+				  ctdb,
+				  &db_vacuum,
+				  &np);
+	if (ret != 0) {
+		DBG_ERR("Invalid data\n");
+		return -1;
+	}
+
+	ctdb_db = find_ctdb_db(ctdb, db_vacuum->db_id);
+	if (ctdb_db == NULL) {
+		DBG_ERR("Unknown db id 0x%08x\n", db_vacuum->db_id);
+		talloc_free(db_vacuum);
+		return -1;
+	}
+
+	/*
+	 * ctdb_vacuum_init() does not create a vacuum handle for
+	 * non-volatile databases, and the vacuum child context
+	 * destructor dereferences the handle.  Refuse to start a run
+	 * for a database that has none.
+	 * NOTE(review): assumes vacuum_handle is NULL until
+	 * ctdb_vacuum_init() sets it -- confirm against the code that
+	 * allocates struct ctdb_db_context.
+	 */
+	if (ctdb_db->vacuum_handle == NULL) {
+		DBG_ERR("Vacuuming is not set up for database %s\n",
+			ctdb_db->db_name);
+		talloc_free(db_vacuum);
+		return -1;
+	}
+
+	state = talloc(ctdb, struct vacuum_control_state);
+	if (state == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		/* db_vacuum hangs off ctdb, so it must be freed here */
+		talloc_free(db_vacuum);
+		return -1;
+	}
+
+	ret = vacuum_db_child(ctdb_db,
+			      ctdb_db,
+			      false,
+			      db_vacuum->full_vacuum_run,
+			      &child_ctx);
+
+	talloc_free(db_vacuum);
+
+	if (ret == 0) {
+		/*
+		 * Tie the state to the child context so that freeing the
+		 * child context runs the state destructor, which sends
+		 * the reply.
+		 */
+		(void) talloc_steal(child_ctx, state);
+
+		state->child_ctx = child_ctx;
+		state->c = talloc_steal(state, c);
+		state->ctdb = ctdb;
+
+		talloc_set_destructor(state, vacuum_control_state_destructor);
+
+		*async_reply = true;
+		return 0;
+	}
+
+	/* No destructor is set yet, so this does not send a reply. */
+	talloc_free(state);
+
+	switch (ret) {
+	case EBUSY:
+		DBG_WARNING("Vacuuming collision\n");
+		break;
+
+	default:
+		DBG_ERR("Temporary vacuuming failure, ret=%d\n", ret);
+	}
+
+	return -1;
+}
+
+/*
+ * Abort the vacuuming run that is currently in progress, if any.
+ */
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb)
+{
+	struct ctdb_vacuum_child_context *active = ctdb->vacuumer;
+
+	if (active == NULL) {
+		return;
+	}
+
+	D_INFO("Aborting vacuuming for %s (%i)\n",
+	       active->vacuum_handle->ctdb_db->db_name,
+	       (int)active->child_pid);
+
+	/* vacuum_child_destructor kills it, removes from list */
+	talloc_free(active);
+}
+
+/*
+ * Initialise the vacuuming context for a database and schedule its
+ * first vacuuming event, one vacuum interval from now.
+ *
+ * Non-volatile databases are not vacuumed: for those only a message
+ * is logged and 0 is returned.
+ *
+ * Returns 0 on success, -1 if the vacuum handle cannot be allocated.
+ */
+int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db)
+{
+	struct ctdb_vacuum_handle *handle = NULL;
+	bool is_volatile = ctdb_db_volatile(ctdb_db);
+
+	if (!is_volatile) {
+		DEBUG(DEBUG_ERR,
+		      ("Vacuuming is disabled for non-volatile database %s\n",
+		       ctdb_db->db_name));
+		return 0;
+	}
+
+	handle = talloc(ctdb_db, struct ctdb_vacuum_handle);
+	if (handle == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		return -1;
+	}
+
+	handle->ctdb_db = ctdb_db;
+	handle->fast_path_count = 0;
+	handle->vacuum_interval = get_vacuum_interval(ctdb_db);
+
+	ctdb_db->vacuum_handle = handle;
+
+	tevent_add_timer(ctdb_db->ctdb->ev,
+			 handle,
+			 timeval_current_ofs(handle->vacuum_interval, 0),
+			 ctdb_vacuum_event,
+			 handle);
+
+	return 0;
+}
+
+/*
+ * Remove the entry for a key from the database's delete queue.
+ *
+ * The queue is looked up by the 32-bit hash of the key. Nothing is
+ * removed if there is no entry for that hash, or if the entry found
+ * belongs to a different key (hash collision).
+ *
+ * hdr is only used for the debug message.
+ */
+static void remove_record_from_delete_queue(struct ctdb_db_context *ctdb_db,
+					    const struct ctdb_ltdb_header *hdr,
+					    const TDB_DATA key)
+{
+	struct delete_record_data *kd;
+	uint32_t hash;
+
+	hash = (uint32_t)ctdb_hash(&key);
+
+	DEBUG(DEBUG_DEBUG, (__location__
+			    " remove_record_from_delete_queue: "
+			    "db[%s] "
+			    "db_id[0x%08x] "
+			    "key_hash[0x%08x] "
+			    "lmaster[%u] "
+			    "migrated_with_data[%s]\n",
+			    ctdb_db->db_name, ctdb_db->db_id,
+			    hash,
+			    ctdb_lmaster(ctdb_db->ctdb, &key),
+			    hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? "yes" : "no"));
+
+	kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash);
+	if (kd == NULL) {
+		/* The full stop belongs before the newline, not after it */
+		DEBUG(DEBUG_DEBUG, (__location__
+				    " remove_record_from_delete_queue: "
+				    "record not in queue (hash[0x%08x]).\n",
+				    hash));
+		return;
+	}
+
+	if ((kd->key.dsize != key.dsize) ||
+	    (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0))
+	{
+		DEBUG(DEBUG_DEBUG, (__location__
+				    " remove_record_from_delete_queue: "
+				    "hash collision for key with hash[0x%08x] "
+				    "in db[%s] - skipping\n",
+				    hash, ctdb_db->db_name));
+		return;
+	}
+
+	DEBUG(DEBUG_DEBUG, (__location__
+			    " remove_record_from_delete_queue: "
+			    "removing key with hash[0x%08x]\n",
+			    hash));
+
+	/*
+	 * NOTE(review): freeing the entry is the only removal step here;
+	 * presumably this also takes it out of the tree -- confirm
+	 * against the tree implementation.
+	 */
+	talloc_free(kd);
+
+	return;
+}
+
+/**
+ * Put a record into the ctdb_db context's delete queue.
+ *
+ * The queue is keyed by the 32-bit hash of the record key. If the
+ * queue already holds an entry with the same hash but a different
+ * key, the new record is skipped and 0 is returned. If the entry is
+ * for the same key, the record is inserted again ("updating entry").
+ *
+ * Returns 0 on success or when the record is skipped, -1 if the
+ * insert into the queue fails.
+ */
+static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
+					   const struct ctdb_ltdb_header *hdr,
+					   TDB_DATA key)
+{
+	struct delete_record_data *existing;
+	uint32_t hash = (uint32_t)ctdb_hash(&key);
+	int ret;
+
+	DEBUG(DEBUG_DEBUG, (__location__ " schedule for deletion: db[%s] "
+			    "db_id[0x%08x] "
+			    "key_hash[0x%08x] "
+			    "lmaster[%u] "
+			    "migrated_with_data[%s]\n",
+			    ctdb_db->db_name, ctdb_db->db_id,
+			    hash,
+			    ctdb_lmaster(ctdb_db->ctdb, &key),
+			    hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? "yes" : "no"));
+
+	existing = (struct delete_record_data *)trbt_lookup32(
+		ctdb_db->delete_queue, hash);
+	if (existing != NULL) {
+		bool same_key =
+			(existing->key.dsize == key.dsize) &&
+			(memcmp(existing->key.dptr, key.dptr, key.dsize) == 0);
+
+		if (!same_key) {
+			DEBUG(DEBUG_INFO,
+			      (__location__ " schedule for deletion: "
+			       "hash collision for key hash [0x%08x]. "
+			       "Skipping the record.\n", hash));
+			return 0;
+		}
+
+		DEBUG(DEBUG_DEBUG,
+		      (__location__ " schedule for deletion: "
+		       "updating entry for key with hash [0x%08x].\n",
+		       hash));
+	}
+
+	ret = insert_delete_record_data_into_tree(ctdb_db->ctdb, ctdb_db,
+						  ctdb_db->delete_queue,
+						  hdr, key);
+	if (ret == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_INFO,
+	      (__location__ " schedule for deletion: error "
+	       "inserting key with hash [0x%08x] into delete queue\n",
+	       hash));
+	return -1;
+}
+
+/**
+ * Schedule a record for deletion.
+ * Called from the parent context.
+ *
+ * indata is interpreted as a struct ctdb_control_schedule_for_deletion
+ * carrying the database id, the record header and the key.
+ * NOTE(review): the size of indata is not checked here; presumably
+ * the caller has validated it -- confirm.
+ *
+ * Returns -1 if the database is unknown, otherwise the result of
+ * inserting the record into the delete queue.
+ */
+int32_t ctdb_control_schedule_for_deletion(struct ctdb_context *ctdb,
+					   TDB_DATA indata)
+{
+	struct ctdb_control_schedule_for_deletion *req =
+		(struct ctdb_control_schedule_for_deletion *)indata.dptr;
+	struct ctdb_db_context *ctdb_db = find_ctdb_db(ctdb, req->db_id);
+	TDB_DATA key;
+
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Unknown db id 0x%08x\n",
+				  req->db_id));
+		return -1;
+	}
+
+	key.dptr = req->key;
+	key.dsize = req->keylen;
+
+	return insert_record_into_delete_queue(ctdb_db, &req->hdr, key);
+}
+
+/*
+ * Schedule a record for deletion from whichever process we are in.
+ *
+ * In the main daemon (our pid equals ctdbd_pid) the record is put
+ * into the delete queue directly. In any other process a
+ * SCHEDULE_FOR_DELETION control is sent to the main daemon, provided
+ * this process is able to send controls.
+ *
+ * Returns 0 on success; -1 if controls cannot be sent, on allocation
+ * failure or if the control reports a non-zero status; otherwise the
+ * error returned by ctdb_control().
+ */
+int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db,
+					 const struct ctdb_ltdb_header *hdr,
+					 TDB_DATA key)
+{
+	int ret;
+	struct ctdb_control_schedule_for_deletion *dd;
+	TDB_DATA indata;
+	/*
+	 * Initialised because status is inspected below even when
+	 * ctdb_control() fails, in which case it may not have been
+	 * written.
+	 */
+	int32_t status = 0;
+
+	if (ctdb_db->ctdb->ctdbd_pid == getpid()) {
+		/* main daemon - directly queue */
+		ret = insert_record_into_delete_queue(ctdb_db, hdr, key);
+
+		return ret;
+	}
+
+	/* if we don't have a connection to the daemon we can not send
+	   a control. For example sometimes from update_record control child
+	   process.
+	*/
+	if (!ctdb_db->ctdb->can_send_controls) {
+		return -1;
+	}
+
+
+	/* child process: send the main daemon a control */
+	indata.dsize = offsetof(struct ctdb_control_schedule_for_deletion, key) + key.dsize;
+	indata.dptr = talloc_zero_array(ctdb_db, uint8_t, indata.dsize);
+	if (indata.dptr == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return -1;
+	}
+	dd = (struct ctdb_control_schedule_for_deletion *)(void *)indata.dptr;
+	dd->db_id = ctdb_db->db_id;
+	dd->hdr = *hdr;
+	dd->keylen = key.dsize;
+	memcpy(dd->key, key.dptr, key.dsize);
+
+	ret = ctdb_control(ctdb_db->ctdb,
+			   CTDB_CURRENT_NODE,
+			   ctdb_db->db_id,
+			   CTDB_CONTROL_SCHEDULE_FOR_DELETION,
+			   CTDB_CTRL_FLAG_NOREPLY, /* flags */
+			   indata,
+			   NULL, /* mem_ctx */
+			   NULL, /* outdata */
+			   &status,
+			   NULL, /* timeout : NULL == wait forever */
+			   NULL); /* error message */
+
+	talloc_free(indata.dptr);
+
+	if (ret != 0 || status != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Error sending "
+				  "SCHEDULE_FOR_DELETION "
+				  "control.\n"));
+		if (status != 0) {
+			ret = -1;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Remove a record from the delete queue, but only when called in the
+ * main daemon (our pid equals ctdbd_pid). In any other process this
+ * does nothing.
+ */
+void ctdb_local_remove_from_delete_queue(struct ctdb_db_context *ctdb_db,
+					 const struct ctdb_ltdb_header *hdr,
+					 const TDB_DATA key)
+{
+	if (ctdb_db->ctdb->ctdbd_pid == getpid()) {
+		remove_record_from_delete_queue(ctdb_db, hdr, key);
+	}
+}
+
+/*
+ * Per-record callback for ctdb_rec_buffer_traverse(): copy the key of
+ * the record into a new fetch_record_data entry and insert it into
+ * the database's fetch queue under the hash of the key.
+ *
+ * private_data is the struct ctdb_db_context. reqid, header and data
+ * are not used.
+ *
+ * Returns 0 on success, -1 if the entry cannot be allocated.
+ */
+static int vacuum_fetch_parser(uint32_t reqid,
+			       struct ctdb_ltdb_header *header,
+			       TDB_DATA key, TDB_DATA data,
+			       void *private_data)
+{
+	struct ctdb_db_context *ctdb_db = talloc_get_type_abort(
+		private_data, struct ctdb_db_context);
+	/* The key bytes are stored inline, right after the fixed part. */
+	size_t alloc_len = offsetof(struct fetch_record_data, keydata) +
+			   key.dsize;
+	struct fetch_record_data *entry;
+	uint32_t hash;
+
+	entry = (struct fetch_record_data *)talloc_size(ctdb_db->fetch_queue,
+							alloc_len);
+	if (entry == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Memory error\n"));
+		return -1;
+	}
+	talloc_set_name_const(entry, "struct fetch_record_data");
+
+	memcpy(entry->keydata, key.dptr, key.dsize);
+	entry->key.dptr = entry->keydata;
+	entry->key.dsize = key.dsize;
+
+	hash = ctdb_hash(&key);
+	trbt_insert32(ctdb_db->fetch_queue, hash, entry);
+
+	return 0;
+}
+
+/*
+ * Control handler for VACUUM_FETCH: unmarshal the record buffer in
+ * indata and add the key of every record in it to the fetch queue of
+ * the database named in the buffer.
+ *
+ * Returns -1 if the data cannot be unmarshalled or the database is
+ * unknown, otherwise the result of traversing the record buffer.
+ */
+int32_t ctdb_control_vacuum_fetch(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_rec_buffer *recbuf;
+	struct ctdb_db_context *ctdb_db;
+	size_t npull;
+	int ret;
+
+	ret = ctdb_rec_buffer_pull(indata.dptr, indata.dsize, ctdb, &recbuf,
+				   &npull);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Invalid data in vacuum_fetch\n"));
+		return -1;
+	}
+
+	ctdb_db = find_ctdb_db(ctdb, recbuf->db_id);
+	if (ctdb_db == NULL) {
+		/* Log before freeing: the message reads recbuf->db_id */
+		DEBUG(DEBUG_ERR, (__location__ " Unknown db 0x%08x\n",
+				  recbuf->db_id));
+		talloc_free(recbuf);
+		return -1;
+	}
+
+	ret = ctdb_rec_buffer_traverse(recbuf, vacuum_fetch_parser, ctdb_db);
+	talloc_free(recbuf);
+	return ret;
+}
diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c
new file mode 100644
index 0000000..10541cb
--- /dev/null
+++ b/ctdb/server/ctdbd.c
@@ -0,0 +1,405 @@
+/*
+ standalone ctdb daemon
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/wait.h"
+#include "system/network.h"
+#include "system/syslog.h"
+
+#include <popt.h>
+#include <talloc.h>
+/* Allow use of deprecated function tevent_loop_allow_nesting() */
+#define TEVENT_DEPRECATED
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/path.h"
+#include "common/logging.h"
+#include "common/logging_conf.h"
+
+#include "ctdb_config.h"
+
+int script_log_level; /* set in main() from ctdb_config.script_log_level */
+bool fast_start; /* set to true in main() when CTDB_TEST_MODE is set */
+
+/*
+  Upcall invoked by the transport layer for every incoming packet.
+
+  Counts the packet, bumps the receive counter of the source node (if
+  the source PNN is valid) so that we know the node is alive, and
+  passes the packet on to ctdb_input_pkt().
+*/
+static void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t length)
+{
+	struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
+
+	CTDB_INCREMENT_STAT(ctdb, node_packets_recv);
+
+	if (ctdb_validate_pnn(ctdb, hdr->srcnode)) {
+		/*
+		 * As a special case, redirected calls (CTDB_REQ_CALL
+		 * with a non-zero hop count) don't increment rx_cnt.
+		 */
+		bool redirected_call =
+			(hdr->operation == CTDB_REQ_CALL) &&
+			(((struct ctdb_req_call_old *)hdr)->hopcount != 0);
+
+		if (!redirected_call) {
+			ctdb->nodes[hdr->srcnode]->rx_cnt++;
+		}
+	}
+
+	ctdb_input_pkt(ctdb, hdr);
+}
+
+/* Upcall table; ctdb_init() installs it as ctdb->upcalls. */
+static const struct ctdb_upcalls ctdb_upcalls = {
+	.recv_pkt       = ctdb_recv_pkt,
+	.node_dead      = ctdb_node_dead,
+	.node_connected = ctdb_node_connected,
+};
+
+/*
+ * Allocate the ctdb context as a talloc child of the event context
+ * and fill in its start-up defaults: request id and srvid contexts,
+ * daemon socket path, pidfile path, start/recovery timestamps, normal
+ * recovery mode, the upcall table, default capabilities, unknown PNN
+ * and public IP checking enabled.
+ *
+ * Returns the new context, or NULL on failure (after logging the
+ * reason and freeing anything allocated so far).
+ */
+static struct ctdb_context *ctdb_init(struct tevent_context *ev)
+{
+	struct ctdb_context *ctdb = NULL;
+	int ret;
+
+	ctdb = talloc_zero(ev, struct ctdb_context);
+	if (ctdb == NULL) {
+		DBG_ERR("Memory error\n");
+		return NULL;
+	}
+	ctdb->ev = ev;
+
+	/* Wrap early to exercise code. */
+	ret = reqid_init(ctdb, INT_MAX-200, &ctdb->idr);
+	if (ret != 0) {
+		D_ERR("reqid_init failed (%s)\n", strerror(ret));
+		goto fail;
+	}
+
+	ret = srvid_init(ctdb, &ctdb->srv);
+	if (ret != 0) {
+		D_ERR("srvid_init failed (%s)\n", strerror(ret));
+		goto fail;
+	}
+
+	ctdb->daemon.name = path_socket(ctdb, "ctdbd");
+	if (ctdb->daemon.name == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		goto fail;
+	}
+
+	ctdbd_pidfile = path_pidfile(ctdb, "ctdbd");
+	if (ctdbd_pidfile == NULL) {
+		DBG_ERR("Memory allocation error\n");
+		goto fail;
+	}
+
+	gettimeofday(&ctdb->ctdbd_start_time, NULL);
+
+	gettimeofday(&ctdb->last_recovery_started, NULL);
+	gettimeofday(&ctdb->last_recovery_finished, NULL);
+
+	ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
+	ctdb->upcalls = &ctdb_upcalls;
+	ctdb->statistics.statistics_start_time = timeval_current();
+	ctdb->capabilities = CTDB_CAP_DEFAULT;
+
+	/*
+	 * Initialise this node's PNN to the unknown value.  This will
+	 * be set to the correct value by either ctdb_add_node() as
+	 * part of loading the nodes file or by
+	 * ctdb_tcp_listen_automatic() when the transport is
+	 * initialised.  At some point we should de-optimise this and
+	 * pull it out into ctdb_start_daemon() so it is done clearly
+	 * and only in one place.
+	 */
+	ctdb->pnn = CTDB_UNKNOWN_PNN;
+
+	ctdb->do_checkpublicip = true;
+
+	return ctdb;
+
+fail:
+	talloc_free(ctdb);
+	return NULL;
+}
+
+
+/*
+ main program
+
+ Sets up the tevent context and the ctdb context, handles the
+ command-line options, loads the configuration, initialises logging
+ and then fills in the cluster, database and legacy settings on the
+ ctdb context before handing over to ctdb_start_daemon().
+
+ Failures before option parsing exit with status 1 directly; any
+ later failure jumps to the fail label, which frees the ctdb context
+ and exits with status 1.
+*/
+int main(int argc, const char *argv[])
+{
+ struct ctdb_context *ctdb = NULL;
+ int interactive_opt = 0; /* written by popt when -i/--interactive is given */
+ bool interactive = false;
+
+ struct poptOption popt_options[] = {
+ POPT_AUTOHELP
+ { "interactive", 'i', POPT_ARG_NONE, &interactive_opt, 0,
+ "don't fork, log to stderr", NULL },
+ POPT_TABLEEND
+ };
+ int opt, ret;
+ const char **extra_argv;
+ poptContext pc;
+ struct tevent_context *ev;
+ const char *ctdb_base;
+ struct conf_context *conf;
+ const char *logging_location;
+ const char *test_mode;
+ bool ok;
+
+ /*
+ * Basic setup
+ */
+
+ talloc_enable_null_tracking();
+
+ fault_setup();
+
+ ev = tevent_context_init(NULL);
+ if (ev == NULL) {
+ fprintf(stderr, "tevent_context_init() failed\n");
+ exit(1);
+ }
+ tevent_loop_allow_nesting(ev); /* deprecated API, hence TEVENT_DEPRECATED above */
+
+ ctdb = ctdb_init(ev);
+ if (ctdb == NULL) {
+ fprintf(stderr, "Failed to init ctdb\n");
+ exit(1);
+ }
+
+ /* Default value for CTDB_BASE - don't override */
+ setenv("CTDB_BASE", CTDB_ETCDIR, 0); /* overwrite=0 keeps an existing value */
+ ctdb_base = getenv("CTDB_BASE");
+ if (ctdb_base == NULL) {
+ D_ERR("CTDB_BASE not set\n");
+ exit(1);
+ }
+
+ /*
+ * Command-line option handling
+ */
+
+ pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+ while ((opt = poptGetNextOpt(pc)) != -1) {
+ switch (opt) {
+ default: /* no option in the table returns a value, so anything here is an error */
+ fprintf(stderr, "Invalid option %s: %s\n",
+ poptBadOption(pc, 0), poptStrerror(opt));
+ goto fail;
+ }
+ }
+
+ /* If there are extra arguments then exit with usage message */
+ extra_argv = poptGetArgs(pc);
+ if (extra_argv) {
+ extra_argv++; /* skip the program name kept by POPT_CONTEXT_KEEP_FIRST */
+ if (extra_argv[0]) {
+ poptPrintHelp(pc, stdout, 0);
+ goto fail;
+ }
+ }
+
+ interactive = (interactive_opt != 0);
+
+ /*
+ * Configuration file handling
+ */
+
+ ret = ctdbd_config_load(ctdb, &conf);
+ if (ret != 0) {
+ /* ctdbd_config_load() logs the failure */
+ goto fail;
+ }
+
+ /*
+ * Logging setup/options
+ */
+
+ test_mode = getenv("CTDB_TEST_MODE"); /* non-NULL selects the test settings further down */
+
+ /* Log to stderr (ignoring configuration) when running as interactive */
+ if (interactive) {
+ logging_location = "file:";
+ setenv("CTDB_INTERACTIVE", "true", 1);
+ } else {
+ logging_location = logging_conf_location(conf);
+ }
+
+ if (strcmp(logging_location, "syslog") != 0 && test_mode == NULL) {
+ /* This can help when CTDB logging is misconfigured */
+ syslog(LOG_DAEMON|LOG_NOTICE,
+ "CTDB logging to location %s",
+ logging_location);
+ }
+
+ /* Initialize logging and set the debug level */
+ ok = ctdb_logging_init(ctdb,
+ logging_location,
+ logging_conf_log_level(conf));
+ if (!ok) {
+ goto fail;
+ }
+ setenv("CTDB_LOGGING", logging_location, 1);
+ setenv("CTDB_DEBUGLEVEL", debug_level_to_string(DEBUGLEVEL), 1);
+
+ script_log_level = debug_level_from_string(
+ ctdb_config.script_log_level);
+
+ D_NOTICE("CTDB starting on node\n");
+
+ /*
+ * Cluster setup/options
+ */
+
+ ret = ctdb_set_transport(ctdb, ctdb_config.transport);
+ if (ret == -1) {
+ D_ERR("ctdb_set_transport failed - %s\n", ctdb_errstr(ctdb));
+ goto fail;
+ }
+
+ if (ctdb_config.cluster_lock != NULL) { /* cluster_lock takes precedence over recovery_lock */
+ ctdb->recovery_lock = ctdb_config.cluster_lock;
+ } else if (ctdb_config.recovery_lock != NULL) {
+ ctdb->recovery_lock = ctdb_config.recovery_lock;
+ } else {
+ D_WARNING("Cluster lock not set\n");
+ }
+
+ /* tell ctdb what address to listen on */
+ if (ctdb_config.node_address) {
+ ret = ctdb_set_address(ctdb, ctdb_config.node_address);
+ if (ret == -1) {
+ D_ERR("ctdb_set_address failed - %s\n",
+ ctdb_errstr(ctdb));
+ goto fail;
+ }
+ }
+
+ /* tell ctdb what nodes are available */
+ ctdb->nodes_file = talloc_asprintf(ctdb, "%s/nodes", ctdb_base);
+ if (ctdb->nodes_file == NULL) {
+ DBG_ERR(" Out of memory\n");
+ goto fail;
+ }
+ ctdb_load_nodes_file(ctdb);
+
+ /*
+ * Database setup/options
+ *
+ * All three database directories must already exist.
+ */
+
+ ctdb->db_directory = ctdb_config.dbdir_volatile;
+ ok = directory_exist(ctdb->db_directory);
+ if (! ok) {
+ D_ERR("Volatile database directory %s does not exist\n",
+ ctdb->db_directory);
+ goto fail;
+ }
+
+ ctdb->db_directory_persistent = ctdb_config.dbdir_persistent;
+ ok = directory_exist(ctdb->db_directory_persistent);
+ if (! ok) {
+ D_ERR("Persistent database directory %s does not exist\n",
+ ctdb->db_directory_persistent);
+ goto fail;
+ }
+
+ ctdb->db_directory_state = ctdb_config.dbdir_state;
+ ok = directory_exist(ctdb->db_directory_state);
+ if (! ok) {
+ D_ERR("State database directory %s does not exist\n",
+ ctdb->db_directory_state);
+ goto fail;
+ }
+
+ if (ctdb_config.lock_debug_script != NULL) {
+ ret = setenv("CTDB_DEBUG_LOCKS",
+ ctdb_config.lock_debug_script,
+ 1);
+ if (ret != 0) {
+ D_ERR("Failed to set up lock debugging (%s)\n",
+ strerror(errno));
+ goto fail;
+ }
+ }
+
+ /*
+ * Legacy setup/options
+ */
+
+ ctdb->start_as_disabled = (int)ctdb_config.start_as_disabled;
+ ctdb->start_as_stopped = (int)ctdb_config.start_as_stopped;
+
+ /* set ctdbd capabilities */
+ if (!ctdb_config.lmaster_capability) {
+ ctdb->capabilities &= ~CTDB_CAP_LMASTER;
+ }
+ if (!ctdb_config.leader_capability) {
+ ctdb->capabilities &= ~CTDB_CAP_RECMASTER;
+ }
+
+ ctdb->do_setsched = ctdb_config.realtime_scheduling;
+
+ /*
+ * Miscellaneous setup
+ */
+
+ ctdb_tunables_load(ctdb);
+
+ ctdb->event_script_dir = talloc_asprintf(ctdb,
+ "%s/events/legacy",
+ ctdb_base);
+ if (ctdb->event_script_dir == NULL) {
+ DBG_ERR("Out of memory\n");
+ goto fail;
+ }
+
+ ctdb->notification_script = talloc_asprintf(ctdb,
+ "%s/notify.sh",
+ ctdb_base);
+ if (ctdb->notification_script == NULL) {
+ D_ERR("Unable to set notification script\n");
+ goto fail;
+ }
+
+ /*
+ * Testing and debug options
+ *
+ * Test mode overrides the scheduling setting taken from the
+ * configuration above.
+ */
+
+ if (test_mode != NULL) {
+ ctdb->do_setsched = false;
+ ctdb->do_checkpublicip = false;
+ fast_start = true;
+ }
+
+ /* start the protocol running (as a child) */
+ return ctdb_start_daemon(ctdb, interactive, test_mode != NULL);
+
+fail:
+ talloc_free(ctdb);
+ exit(1);
+}
diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c
new file mode 100644
index 0000000..3ea7d74
--- /dev/null
+++ b/ctdb/server/eventscript.c
@@ -0,0 +1,845 @@
+/*
+ event script handling
+
+ Copyright (C) Andrew Tridgell 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/wait.h"
+#include "system/dir.h"
+#include "system/locale.h"
+#include "system/time.h"
+#include "system/dir.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+
+#include "ctdb_private.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/reqid.h"
+#include "common/sock_io.h"
+#include "common/path.h"
+
+#include "protocol/protocol_util.h"
+#include "event/event_protocol_api.h"
+
+/*
+ * Setting up event daemon
+ */
+
+/*
+ * State kept by ctdbd for the event daemon (eventd): the child process
+ * itself plus the client connection used to send it requests.
+ */
+struct eventd_context {
+ /* tevent context, copied from ctdb->ev when the context is created */
+ struct tevent_context *ev;
+ /* eventd executable: $CTDB_EVENTD if set, otherwise
+ * CTDB_HELPER_BINDIR "/ctdb-eventd" */
+ const char *path;
+ /* socket path obtained from path_socket(..., "eventd") */
+ const char *socket;
+
+ /* server state */
+ /* pid of the forked eventd, -1 while no daemon is running */
+ pid_t eventd_pid;
+ /* read event on the startup pipe; handled by eventd_dead_handler */
+ struct tevent_fd *eventd_fde;
+
+ /* client state */
+ /* request ID allocator for in-flight requests */
+ struct reqid_context *idr;
+ /* socket queue to eventd, NULL while not connected */
+ struct sock_queue *queue;
+ /* list of requests that have been written and await a reply */
+ struct eventd_client_state *calls;
+};
+
+/*
+ * Allocate and fill in an eventd context as a talloc child of mem_ctx.
+ *
+ * The eventd binary defaults to CTDB_HELPER_BINDIR "/ctdb-eventd" and can
+ * be overridden with the CTDB_EVENTD environment variable.
+ *
+ * On success stores the new context in *out and returns true.  On any
+ * failure the partially built context is freed, *out is left untouched
+ * and false is returned.
+ */
+static bool eventd_context_init(TALLOC_CTX *mem_ctx,
+				struct ctdb_context *ctdb,
+				struct eventd_context **out)
+{
+	struct eventd_context *ctx;
+	const char *binary;
+	int rc;
+
+	ctx = talloc_zero(mem_ctx, struct eventd_context);
+	if (ctx == NULL) {
+		return false;
+	}
+
+	ctx->ev = ctdb->ev;
+
+	/* Environment override wins over the compiled-in helper path */
+	binary = getenv("CTDB_EVENTD");
+	if (binary == NULL) {
+		binary = CTDB_HELPER_BINDIR "/ctdb-eventd";
+	}
+
+	ctx->path = talloc_strdup(ctx, binary);
+	if (ctx->path == NULL) {
+		goto fail;
+	}
+
+	ctx->socket = path_socket(ctx, "eventd");
+	if (ctx->socket == NULL) {
+		goto fail;
+	}
+
+	rc = reqid_init(ctx, 1, &ctx->idr);
+	if (rc != 0) {
+		goto fail;
+	}
+
+	/* No daemon has been started yet */
+	ctx->eventd_pid = -1;
+
+	*out = ctx;
+	return true;
+
+fail:
+	talloc_free(ctx);
+	return false;
+}
+
+/* Shared between wait_for_daemon_startup() and its two tevent handlers */
+struct eventd_startup_state {
+ /* set by either handler to end the wait loop */
+ bool done;
+ /* 0 on successful startup, otherwise an errno-style value */
+ int ret;
+ /* read end of the startup pipe */
+ int fd;
+};
+
+/*
+ * Timer callback: eventd did not report its startup status in time.
+ * Records ETIMEDOUT and ends the wait loop.
+ */
+static void eventd_startup_timeout_handler(struct tevent_context *ev,
+					   struct tevent_timer *te,
+					   struct timeval t,
+					   void *private_data)
+{
+	struct eventd_startup_state *startup = private_data;
+
+	startup->ret = ETIMEDOUT;
+	startup->done = true;
+}
+
+/*
+ * fd callback: the startup pipe became readable.
+ *
+ * Reads one unsigned int status word from the pipe and translates the
+ * outcome into state->ret:
+ *   full word, value 0      -> 0
+ *   full word, other value  -> EIO
+ *   end of file             -> EPIPE
+ *   read error              -> errno
+ *   any other length        -> EINVAL
+ */
+static void eventd_startup_handler(struct tevent_context *ev,
+				   struct tevent_fd *fde, uint16_t flags,
+				   void *private_data)
+{
+	struct eventd_startup_state *startup = private_data;
+	unsigned int status;
+	ssize_t nread;
+	int result;
+
+	nread = sys_read(startup->fd, &status, sizeof(status));
+	if (nread == -1) {
+		result = errno;
+	} else if (nread == 0) {
+		result = EPIPE;
+	} else if (nread != sizeof(status)) {
+		result = EINVAL;
+	} else {
+		result = (status == 0) ? 0 : EIO;
+	}
+
+	startup->ret = result;
+	startup->done = true;
+}
+
+
+/*
+ * Block (by running the event loop) until eventd reports its startup
+ * status on fd or 10 seconds have passed.
+ *
+ * Returns 0 if eventd reported success, ENOMEM if the timer or fd event
+ * could not be set up, otherwise the error recorded by the handlers
+ * (ETIMEDOUT, EIO, EPIPE, EINVAL or an errno from the read).
+ */
+static int wait_for_daemon_startup(struct tevent_context *ev,
+				   int fd)
+{
+	struct eventd_startup_state state = {
+		.done = false,
+		.ret = 0,
+		.fd = fd,
+	};
+	TALLOC_CTX *tmp_ctx;
+	int rc = ENOMEM;
+
+	/* Both events hang off tmp_ctx so one free removes them again */
+	tmp_ctx = talloc_new(ev);
+	if (tmp_ctx == NULL) {
+		return ENOMEM;
+	}
+
+	if (tevent_add_timer(ev,
+			     tmp_ctx,
+			     tevent_timeval_current_ofs(10, 0),
+			     eventd_startup_timeout_handler,
+			     &state) == NULL) {
+		goto done;
+	}
+
+	if (tevent_add_fd(ev,
+			  tmp_ctx,
+			  fd,
+			  TEVENT_FD_READ,
+			  eventd_startup_handler,
+			  &state) == NULL) {
+		goto done;
+	}
+
+	while (! state.done) {
+		tevent_loop_once(ev);
+	}
+	rc = state.ret;
+
+done:
+	talloc_free(tmp_ctx);
+	return rc;
+}
+
+
+/*
+ * Start and stop event daemon
+ */
+
+static bool eventd_client_connect(struct eventd_context *ectx);
+static void eventd_dead_handler(struct tevent_context *ev,
+ struct tevent_fd *fde, uint16_t flags,
+ void *private_data);
+
+/*
+ * Start the event daemon as a child process and connect to it.
+ *
+ * Creates ctdb->ectx on first use, forks and execs the eventd binary as
+ * "<path> -P <ctdbd pid> -S <pipe write fd>", waits for eventd to report
+ * its startup status over the pipe and then connects to its socket.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+int ctdb_start_eventd(struct ctdb_context *ctdb)
+{
+ struct eventd_context *ectx;
+ const char **argv;
+ int fd[2];
+ pid_t pid;
+ int ret;
+ bool status;
+
+ if (ctdb->ectx == NULL) {
+ status = eventd_context_init(ctdb, ctdb, &ctdb->ectx);
+ if (! status) {
+ DEBUG(DEBUG_ERR,
+ ("Failed to initialize eventd context\n"));
+ return -1;
+ }
+ }
+
+ ectx = ctdb->ectx;
+
+ if (! sock_clean(ectx->socket)) {
+ return -1;
+ }
+
+ /* Pipe on which eventd reports startup; fd[1] is handed over via -S */
+ ret = pipe(fd);
+ if (ret != 0) {
+ return -1;
+ }
+
+ /* 5 arguments plus the terminating NULL */
+ argv = talloc_array(ectx, const char *, 6);
+ if (argv == NULL) {
+ close(fd[0]);
+ close(fd[1]);
+ return -1;
+ }
+
+ argv[0] = ectx->path;
+ argv[1] = "-P";
+ argv[2] = talloc_asprintf(argv, "%d", ctdb->ctdbd_pid);
+ argv[3] = "-S";
+ argv[4] = talloc_asprintf(argv, "%d", fd[1]);
+ argv[5] = NULL;
+
+ if (argv[2] == NULL || argv[4] == NULL) {
+ close(fd[0]);
+ close(fd[1]);
+ talloc_free(argv);
+ return -1;
+ }
+
+ D_NOTICE("Starting event daemon %s %s %s %s %s\n",
+ argv[0],
+ argv[1],
+ argv[2],
+ argv[3],
+ argv[4]);
+
+ pid = ctdb_fork(ctdb);
+ if (pid == -1) {
+ close(fd[0]);
+ close(fd[1]);
+ talloc_free(argv);
+ return -1;
+ }
+
+ if (pid == 0) {
+ /* Child: keeps only the write end, then becomes eventd */
+ close(fd[0]);
+ ret = execv(argv[0], discard_const(argv));
+ if (ret == -1) {
+ /* execv() only returns on failure */
+ _exit(errno);
+ }
+ _exit(0);
+ }
+
+ /* Parent: keeps only the read end */
+ talloc_free(argv);
+ close(fd[1]);
+
+ ret = wait_for_daemon_startup(ctdb->ev, fd[0]);
+ if (ret != 0) {
+ ctdb_kill(ctdb, pid, SIGKILL);
+ close(fd[0]);
+ D_ERR("Failed to initialize event daemon (%d)\n", ret);
+ return -1;
+ }
+
+ /* From now on anything readable on the pipe is handled by
+ * eventd_dead_handler */
+ ectx->eventd_fde = tevent_add_fd(ctdb->ev, ectx, fd[0],
+ TEVENT_FD_READ,
+ eventd_dead_handler, ectx);
+ if (ectx->eventd_fde == NULL) {
+ ctdb_kill(ctdb, pid, SIGKILL);
+ close(fd[0]);
+ return -1;
+ }
+
+ /* fd[0] is now owned by eventd_fde and closed when it is freed */
+ tevent_fd_set_auto_close(ectx->eventd_fde);
+ ectx->eventd_pid = pid;
+
+ status = eventd_client_connect(ectx);
+ if (! status) {
+ DEBUG(DEBUG_ERR, ("Failed to connect to event daemon\n"));
+ /* Terminates the child and frees ctdb->ectx */
+ ctdb_stop_eventd(ctdb);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * fd callback registered by ctdb_start_eventd() on the read end of the
+ * startup pipe once eventd is running.  Any event on that fd is treated
+ * as eventd having gone away: log it and terminate this process.
+ */
+static void eventd_dead_handler(struct tevent_context *ev,
+ struct tevent_fd *fde, uint16_t flags,
+ void *private_data)
+{
+ D_ERR("Eventd went away - exiting\n");
+ exit(1);
+}
+
+/*
+ * Stop the event daemon, if there is one, and release ctdb->ectx.
+ *
+ * The pipe watcher is removed first so that eventd exiting does not
+ * trigger eventd_dead_handler.  Safe to call when no context exists.
+ */
+void ctdb_stop_eventd(struct ctdb_context *ctdb)
+{
+	struct eventd_context *ectx = ctdb->ectx;
+
+	if (ectx != NULL) {
+		TALLOC_FREE(ectx->eventd_fde);
+
+		if (ectx->eventd_pid != -1) {
+			kill(ectx->eventd_pid, SIGTERM);
+			ectx->eventd_pid = -1;
+		}
+
+		TALLOC_FREE(ctdb->ectx);
+	}
+}
+
+/*
+ * Connect to event daemon
+ */
+
+/* One request sent to eventd for which a reply is still expected */
+struct eventd_client_state {
+ /* links for the eventd_context.calls list */
+ struct eventd_client_state *prev, *next;
+
+ /* owning context; used by the destructor to unregister the request */
+ struct eventd_context *ectx;
+ /* invoked from eventd_client_read() when the matching reply arrives */
+ void (*callback)(struct ctdb_event_reply *reply, void *private_data);
+ void *private_data;
+
+ /* ID placed in the request header and used to match the reply */
+ uint32_t reqid;
+ /* marshalled request and its length */
+ uint8_t *buf;
+ size_t buflen;
+};
+
+static void eventd_client_read(uint8_t *buf, size_t buflen,
+ void *private_data);
+static int eventd_client_state_destructor(struct eventd_client_state *state);
+
+/*
+ * Make sure there is a client connection to eventd.
+ *
+ * Does nothing if a queue already exists.  Otherwise connects to the
+ * eventd socket and sets up a queue whose incoming data is delivered to
+ * eventd_client_read().  Returns true if a connection is available.
+ */
+static bool eventd_client_connect(struct eventd_context *ectx)
+{
+	int sock;
+
+	if (ectx->queue == NULL) {
+		sock = sock_connect(ectx->socket);
+		if (sock == -1) {
+			return false;
+		}
+
+		ectx->queue = sock_queue_setup(ectx, ectx->ev, sock,
+					       eventd_client_read, ectx);
+		if (ectx->queue == NULL) {
+			close(sock);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Marshal a request and queue it for sending to eventd.
+ *
+ * A tracking structure is allocated on mem_ctx; freeing it (or mem_ctx)
+ * unregisters the request through eventd_client_state_destructor().
+ * callback is invoked with private_data when the reply arrives.
+ *
+ * Returns 0 on success, -1 on failure (nothing is left allocated).
+ */
+static int eventd_client_write(struct eventd_context *ectx,
+			       TALLOC_CTX *mem_ctx,
+			       struct ctdb_event_request *request,
+			       void (*callback)(struct ctdb_event_reply *reply,
+						void *private_data),
+			       void *private_data)
+{
+	struct ctdb_event_header hdr = { 0 };
+	struct eventd_client_state *call;
+	int rc;
+
+	if (! eventd_client_connect(ectx)) {
+		return -1;
+	}
+
+	call = talloc_zero(mem_ctx, struct eventd_client_state);
+	if (call == NULL) {
+		return -1;
+	}
+
+	call->ectx = ectx;
+	call->callback = callback;
+	call->private_data = private_data;
+
+	call->reqid = reqid_new(ectx->idr, call);
+	if (call->reqid == REQID_INVALID) {
+		/* Destructor not installed yet: nothing to unregister */
+		goto fail;
+	}
+
+	talloc_set_destructor(call, eventd_client_state_destructor);
+
+	hdr.reqid = call->reqid;
+
+	call->buflen = ctdb_event_request_len(&hdr, request);
+	call->buf = talloc_size(call, call->buflen);
+	if (call->buf == NULL) {
+		goto fail;
+	}
+
+	rc = ctdb_event_request_push(&hdr,
+				     request,
+				     call->buf,
+				     &call->buflen);
+	if (rc != 0) {
+		goto fail;
+	}
+
+	rc = sock_queue_write(ectx->queue, call->buf, call->buflen);
+	if (rc != 0) {
+		goto fail;
+	}
+
+	DLIST_ADD(ectx->calls, call);
+
+	return 0;
+
+fail:
+	talloc_free(call);
+	return -1;
+}
+
+/*
+ * talloc destructor for an outstanding request: releases its request ID
+ * and unlinks it from the context's list of calls.  Always returns 0.
+ */
+static int eventd_client_state_destructor(struct eventd_client_state *state)
+{
+	reqid_remove(state->ectx->idr, state->reqid);
+	DLIST_REMOVE(state->ectx->calls, state);
+
+	return 0;
+}
+
+/*
+ * Queue read callback for the eventd connection.
+ *
+ * A NULL buffer means the connection was lost; the queue is dropped.
+ * Otherwise the reply is unmarshalled, matched to its outstanding request
+ * by request ID and passed to that request's callback.  Replies that are
+ * malformed, have a length mismatch or match no request are discarded.
+ * The request state is freed together with the reply after the callback.
+ */
+static void eventd_client_read(uint8_t *buf, size_t buflen,
+			       void *private_data)
+{
+	struct eventd_context *ectx = talloc_get_type_abort(
+		private_data, struct eventd_context);
+	struct eventd_client_state *call;
+	struct ctdb_event_header hdr;
+	struct ctdb_event_reply *reply;
+	int rc;
+
+	if (buf == NULL) {
+		/* connection lost */
+		TALLOC_FREE(ectx->queue);
+		return;
+	}
+
+	rc = ctdb_event_reply_pull(buf, buflen, &hdr, ectx, &reply);
+	if (rc != 0) {
+		D_ERR("Invalid packet received, ret=%d\n", rc);
+		return;
+	}
+
+	if (buflen != hdr.length) {
+		D_ERR("Packet size mismatch %zu != %"PRIu32"\n",
+		      buflen, hdr.length);
+		goto drop;
+	}
+
+	call = reqid_find(ectx->idr, hdr.reqid,
+			  struct eventd_client_state);
+	if (call == NULL || call->reqid != hdr.reqid) {
+		goto drop;
+	}
+
+	/* Tie the request's lifetime to the reply: one free releases both */
+	call = talloc_steal(reply, call);
+	call->callback(reply, call->private_data);
+
+drop:
+	talloc_free(reply);
+}
+
+/*
+ * Run an event
+ */
+
+/* Per-request state for a "run event" request sent by eventd_client_run() */
+struct eventd_client_run_state {
+ /* context the state is re-parented to just before the callback runs */
+ struct eventd_context *ectx;
+ /* receives the result field of eventd's reply */
+ void (*callback)(int result, void *private_data);
+ void *private_data;
+};
+
+static void eventd_client_run_done(struct ctdb_event_reply *reply,
+ void *private_data);
+
+/*
+ * Send a CTDB_EVENT_CMD_RUN request for the "legacy" component to eventd.
+ *
+ * arg_str may be NULL.  callback is invoked with the result from eventd's
+ * reply.  The tracking state is allocated on mem_ctx, so freeing mem_ctx
+ * before the reply arrives cancels the callback.
+ *
+ * Returns 0 if the request was queued, otherwise the error from
+ * eventd_client_write().
+ */
+static int eventd_client_run(struct eventd_context *ectx,
+			     TALLOC_CTX *mem_ctx,
+			     void (*callback)(int result,
+					      void *private_data),
+			     void *private_data,
+			     enum ctdb_event event,
+			     const char *arg_str,
+			     uint32_t timeout)
+{
+	struct ctdb_event_request_run run_data = {
+		.component = "legacy",
+		.event = ctdb_event_to_string(event),
+		.args = arg_str,
+		.timeout = timeout,
+		.flags = 0,
+	};
+	struct ctdb_event_request req = {
+		.cmd = CTDB_EVENT_CMD_RUN,
+		.data.run = &run_data,
+	};
+	struct eventd_client_run_state *run;
+	int rc;
+
+	run = talloc_zero(mem_ctx, struct eventd_client_run_state);
+	if (run == NULL) {
+		return -1;
+	}
+
+	run->ectx = ectx;
+	run->callback = callback;
+	run->private_data = private_data;
+
+	/* The request is marshalled before this returns, so stack data is fine */
+	rc = eventd_client_write(ectx, run, &req,
+				 eventd_client_run_done, run);
+	if (rc != 0) {
+		talloc_free(run);
+	}
+
+	return rc;
+}
+
+/*
+ * Reply callback for eventd_client_run(): forwards reply->result to the
+ * caller's callback.  The state is first moved under the eventd context
+ * and then freed once the callback has returned.
+ */
+static void eventd_client_run_done(struct ctdb_event_reply *reply,
+				   void *private_data)
+{
+	struct eventd_client_run_state *run = talloc_get_type_abort(
+		private_data, struct eventd_client_run_state);
+
+	run = talloc_steal(run->ectx, run);
+	run->callback(reply->result, run->private_data);
+	talloc_free(run);
+}
+
+/*
+ * CTDB event script functions
+ */
+
+int ctdb_event_script_run(struct ctdb_context *ctdb,
+ TALLOC_CTX *mem_ctx,
+ void (*callback)(struct ctdb_context *ctdb,
+ int result, void *private_data),
+ void *private_data,
+ enum ctdb_event event,
+ const char *fmt, va_list ap)
+ PRINTF_ATTRIBUTE(6,0);
+
+/* Per-call state for ctdb_event_script_run() */
+struct ctdb_event_script_run_state {
+ struct ctdb_context *ctdb;
+ /* caller's completion callback and its argument */
+ void (*callback)(struct ctdb_context *ctdb, int result,
+ void *private_data);
+ void *private_data;
+ /* event being run; decides whether a timeout is ignored on completion */
+ enum ctdb_event event;
+};
+
+static bool event_allowed_during_recovery(enum ctdb_event event);
+static void ctdb_event_script_run_done(int result, void *private_data);
+static bool check_options(enum ctdb_event call, const char *options);
+
+/*
+ * Ask eventd to run an event and return without waiting for it.
+ *
+ * fmt/ap describe the event's argument string; fmt may be NULL for events
+ * that take no arguments.  The request is refused while in recovery unless
+ * the event is one of those allowed during recovery, and when the number
+ * of arguments does not suit the event.
+ *
+ * callback is invoked with the event's result once eventd replies.  All
+ * state is allocated on mem_ctx, so freeing mem_ctx cancels the callback.
+ *
+ * Returns 0 if the request was sent, non-zero otherwise.  Nothing is left
+ * allocated on mem_ctx on failure.
+ */
+int ctdb_event_script_run(struct ctdb_context *ctdb,
+			  TALLOC_CTX *mem_ctx,
+			  void (*callback)(struct ctdb_context *ctdb,
+					   int result, void *private_data),
+			  void *private_data,
+			  enum ctdb_event event,
+			  const char *fmt, va_list ap)
+{
+	struct ctdb_event_script_run_state *state;
+	char *arg_str;
+	int ret;
+
+	if ( (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) &&
+	     (! event_allowed_during_recovery(event)) ) {
+		DEBUG(DEBUG_ERR,
+		      ("Refusing to run event '%s' while in recovery\n",
+		       ctdb_eventscript_call_names[event]));
+		return -1;
+	}
+
+	state = talloc_zero(mem_ctx, struct ctdb_event_script_run_state);
+	if (state == NULL) {
+		return -1;
+	}
+
+	state->ctdb = ctdb;
+	state->callback = callback;
+	state->private_data = private_data;
+	state->event = event;
+
+	if (fmt != NULL) {
+		arg_str = talloc_vasprintf(state, fmt, ap);
+		if (arg_str == NULL) {
+			talloc_free(state);
+			return -1;
+		}
+	} else {
+		arg_str = NULL;
+	}
+
+	if (! check_options(event, arg_str)) {
+		DEBUG(DEBUG_ERR,
+		      ("Bad event script arguments '%s' for '%s'\n",
+		       arg_str, ctdb_eventscript_call_names[event]));
+		/*
+		 * Free the whole state, not just arg_str: arg_str is a
+		 * talloc child of state, and state would otherwise stay
+		 * attached to mem_ctx with nobody left to release it.
+		 */
+		talloc_free(state);
+		return -1;
+	}
+
+	ret = eventd_client_run(ctdb->ectx, state,
+				ctdb_event_script_run_done, state,
+				event, arg_str, ctdb->tunable.script_timeout);
+	if (ret != 0) {
+		talloc_free(state);
+		return ret;
+	}
+
+	DEBUG(DEBUG_INFO,
+	      (__location__ " Running event %s with arguments %s\n",
+	       ctdb_eventscript_call_names[event], arg_str));
+
+	/* The request has been marshalled; the string is not needed again */
+	talloc_free(arg_str);
+	return 0;
+}
+
+/*
+ * Completion callback for ctdb_event_script_run().
+ *
+ * A timeout of the startrecovery, recovered, takeip or releaseip event is
+ * logged and reported to the caller as success; every other result is
+ * passed through unchanged.  The state is freed after the callback.
+ */
+static void ctdb_event_script_run_done(int result, void *private_data)
+{
+	struct ctdb_event_script_run_state *run = talloc_get_type_abort(
+		private_data, struct ctdb_event_script_run_state);
+	bool ignore_timeout =
+		(run->event == CTDB_EVENT_START_RECOVERY) ||
+		(run->event == CTDB_EVENT_RECOVERED) ||
+		(run->event == CTDB_EVENT_TAKE_IP) ||
+		(run->event == CTDB_EVENT_RELEASE_IP);
+
+	if (result == ETIMEDOUT && ignore_timeout) {
+		DEBUG(DEBUG_ERR,
+		      ("Ignoring hung script for %s event\n",
+		       ctdb_eventscript_call_names[run->event]));
+		result = 0;
+	}
+
+	run = talloc_steal(run->ctdb, run);
+	run->callback(run->ctdb, result, run->private_data);
+	talloc_free(run);
+}
+
+
+/*
+ * Count the words in options, where words are separated by runs of
+ * spaces and/or tabs.  Leading and trailing separators are ignored.
+ * A NULL or blank string contains 0 words.
+ */
+static unsigned int count_words(const char *options)
+{
+	static const char blanks[] = " \t";
+	unsigned int count = 0;
+	const char *p;
+
+	if (options == NULL) {
+		return 0;
+	}
+
+	/* p always rests on the first character of a word, or on the NUL */
+	for (p = options + strspn(options, blanks);
+	     *p != '\0';
+	     p += strspn(p, blanks)) {
+		count++;
+		p += strcspn(p, blanks);
+	}
+
+	return count;
+}
+
+/*
+ * Check that options holds the number of words the given event expects:
+ * 3 for takeip/releaseip, 4 for updateip, none for the other known
+ * events.  Unknown events are logged and rejected.
+ */
+static bool check_options(enum ctdb_event call, const char *options)
+{
+	unsigned int expected;
+
+	switch (call) {
+	/* These all take no arguments. */
+	case CTDB_EVENT_INIT:
+	case CTDB_EVENT_SETUP:
+	case CTDB_EVENT_STARTUP:
+	case CTDB_EVENT_START_RECOVERY:
+	case CTDB_EVENT_RECOVERED:
+	case CTDB_EVENT_MONITOR:
+	case CTDB_EVENT_SHUTDOWN:
+	case CTDB_EVENT_IPREALLOCATED:
+		expected = 0;
+		break;
+
+	/* interface, IP address, netmask bits. */
+	case CTDB_EVENT_TAKE_IP:
+	case CTDB_EVENT_RELEASE_IP:
+		expected = 3;
+		break;
+
+	/* old interface, new interface, IP address, netmask bits. */
+	case CTDB_EVENT_UPDATE_IP:
+		expected = 4;
+		break;
+
+	default:
+		DEBUG(DEBUG_ERR,(__location__ "Unknown ctdb_event %u\n", call));
+		return false;
+	}
+
+	return count_words(options) == expected;
+}
+
+/*
+ * Only a fixed set of events may be run while the node is in recovery;
+ * returns true if event is one of them.
+ */
+static bool event_allowed_during_recovery(enum ctdb_event event)
+{
+	switch (event) {
+	case CTDB_EVENT_INIT:
+	case CTDB_EVENT_SETUP:
+	case CTDB_EVENT_START_RECOVERY:
+	case CTDB_EVENT_SHUTDOWN:
+	case CTDB_EVENT_RELEASE_IP:
+	case CTDB_EVENT_IPREALLOCATED:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+/*
+  Run an event in the background.  callback is invoked with the result
+  once the event has finished; if mem_ctx is freed before that, callback
+  is never called.  Returns 0 if the event was started.
+ */
+int ctdb_event_script_callback(struct ctdb_context *ctdb,
+			       TALLOC_CTX *mem_ctx,
+			       void (*callback)(struct ctdb_context *, int, void *),
+			       void *private_data,
+			       enum ctdb_event call,
+			       const char *fmt, ...)
+{
+	va_list args;
+	int rc;
+
+	va_start(args, fmt);
+	rc = ctdb_event_script_run(ctdb, mem_ctx, callback, private_data,
+				   call, fmt, args);
+	va_end(args);
+
+	return rc;
+}
+
+
+/* Completion flag and result for the synchronous ctdb_event_script_args() */
+struct ctdb_event_script_args_state {
+ /* set once the event has completed */
+ bool done;
+ /* result passed to the completion callback */
+ int status;
+};
+
+/*
+ * Completion callback used by ctdb_event_script_args(): stores the result
+ * and marks the wait as finished.
+ */
+static void ctdb_event_script_args_done(struct ctdb_context *ctdb,
+					int status, void *private_data)
+{
+	struct ctdb_event_script_args_state *wait_state = private_data;
+
+	wait_state->status = status;
+	wait_state->done = true;
+}
+
+/*
+  Run an event and wait for it to complete by running the event loop.
+  Used when the caller cannot continue until the event has finished.
+
+  Returns the error from starting the event if it could not be started,
+  otherwise the event's result.  If the event timed out, and it is neither
+  the init nor the shutdown event, this node bans itself.
+ */
+int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_event call,
+			   const char *fmt, ...)
+{
+	struct ctdb_event_script_args_state wait_state = {
+		.done = false,
+		.status = -1,
+	};
+	va_list args;
+	int rc;
+
+	va_start(args, fmt);
+	rc = ctdb_event_script_run(ctdb, ctdb,
+				   ctdb_event_script_args_done, &wait_state,
+				   call, fmt, args);
+	va_end(args);
+	if (rc != 0) {
+		return rc;
+	}
+
+	while (! wait_state.done) {
+		tevent_loop_once(ctdb->ev);
+	}
+
+	/* Don't ban self if CTDB is starting up or shutting down */
+	if (wait_state.status == ETIMEDOUT &&
+	    call != CTDB_EVENT_INIT &&
+	    call != CTDB_EVENT_SHUTDOWN) {
+		DEBUG(DEBUG_ERR,
+		      (__location__ " eventscript for '%s' timed out."
+		       " Immediately banning ourself for %d seconds\n",
+		       ctdb_eventscript_call_names[call],
+		       ctdb->tunable.recovery_ban_period));
+		ctdb_ban_self(ctdb);
+	}
+
+	return wait_state.status;
+}
+
+/*
+ * Run an event that takes no arguments and wait for it to complete.
+ * Returns whatever ctdb_event_script_args() returns.
+ */
+int ctdb_event_script(struct ctdb_context *ctdb, enum ctdb_event call)
+{
+ /* A NULL format means "no arguments": ctdb_event_script_run() then
+ * builds no argument string at all. */
+ return ctdb_event_script_args(ctdb, call, NULL);
+}
+
+/*
+ * Ask a running event daemon to reopen its logs by sending it SIGHUP.
+ *
+ * Does nothing if there is no eventd context (never started, or already
+ * torn down by ctdb_stop_eventd()) or no daemon pid is recorded.
+ */
+void ctdb_event_reopen_logs(struct ctdb_context *ctdb)
+{
+	struct eventd_context *ectx = ctdb->ectx;
+
+	/* ctdb_stop_eventd() resets ctdb->ectx to NULL: don't dereference it */
+	if (ectx == NULL) {
+		return;
+	}
+
+	if (ectx->eventd_pid > 0) {
+		kill(ectx->eventd_pid, SIGHUP);
+	}
+}
diff --git a/ctdb/server/ipalloc.c b/ctdb/server/ipalloc.c
new file mode 100644
index 0000000..7f49364
--- /dev/null
+++ b/ctdb/server/ipalloc.c
@@ -0,0 +1,284 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include <talloc.h>
+
+#include "lib/util/debug.h"
+
+#include "common/logging.h"
+#include "common/rb_tree.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/*
+ * Allocate the main ipalloc state on mem_ctx and record the given
+ * settings in it.  force_rebalance_nodes is stored as passed, not copied.
+ * Returns NULL if memory allocation fails.
+ */
+struct ipalloc_state *
+ipalloc_state_init(TALLOC_CTX *mem_ctx,
+		   uint32_t num_nodes,
+		   enum ipalloc_algorithm algorithm,
+		   bool no_ip_takeover,
+		   bool no_ip_failback,
+		   uint32_t *force_rebalance_nodes)
+{
+	struct ipalloc_state *state;
+
+	state = talloc_zero(mem_ctx, struct ipalloc_state);
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+		return NULL;
+	}
+
+	state->num = num_nodes;
+	state->algorithm = algorithm;
+	state->no_ip_takeover = no_ip_takeover;
+	state->no_ip_failback = no_ip_failback;
+	state->force_rebalance_nodes = force_rebalance_nodes;
+
+	return state;
+}
+
+/*
+ * Tree insert callback used while merging the per-node IP lists.
+ *
+ * parm is the entry being inserted, data the entry already stored under
+ * the same key (or NULL).  The new entry always replaces the old one; if
+ * the new entry has no node assigned it inherits the old entry's node.
+ */
+static void *add_ip_callback(void *parm, void *data)
+{
+	struct public_ip_list *incoming = parm;
+	struct public_ip_list *existing = data;
+
+	if (existing != NULL && incoming->pnn == CTDB_UNKNOWN_PNN) {
+		incoming->pnn = existing->pnn;
+	}
+
+	return parm;
+}
+
+/*
+ * Tree traversal callback: pushes the visited entry (data) onto the front
+ * of the list whose head pointer is passed in param.  Always returns 0.
+ */
+static int getips_count_callback(void *param, void *data)
+{
+	struct public_ip_list **head = param;
+	struct public_ip_list *entry = data;
+
+	entry->next = *head;
+	*head = entry;
+
+	return 0;
+}
+
+/* Nodes only know about those public addresses that they are
+ * configured to serve and no individual node has a full list of all
+ * public addresses configured across the cluster.  Therefore, a
+ * merged list of all public addresses needs to be built so that IP
+ * allocation can be done.
+ *
+ * The per-node known lists are merged through a temporary tree keyed by
+ * ip_key(), so each address appears once in the result.  An entry keeps
+ * a node assignment only when the node reporting it is itself the host.
+ *
+ * Returns the merged list (entries are talloc children of
+ * ipalloc_state), or NULL if the known IPs have not been set, memory
+ * runs out or there are no addresses at all. */
+static struct public_ip_list *
+create_merged_ip_list(struct ipalloc_state *ipalloc_state)
+{
+	unsigned int i, j;
+	struct public_ip_list *ip_list;
+	struct ctdb_public_ip_list *public_ips;
+	struct trbt_tree *ip_tree;
+	int ret;
+
+	/* Validate the input before allocating anything, so this early
+	 * return cannot leave a tree behind on ipalloc_state */
+	if (ipalloc_state->known_public_ips == NULL) {
+		DEBUG(DEBUG_ERR, ("Known public IPs not set\n"));
+		return NULL;
+	}
+
+	ip_tree = trbt_create(ipalloc_state, 0);
+
+	for (i=0; i < ipalloc_state->num; i++) {
+
+		public_ips = &ipalloc_state->known_public_ips[i];
+
+		for (j=0; j < public_ips->num; j++) {
+			struct public_ip_list *tmp_ip;
+
+			/* This is returned as part of ip_list */
+			tmp_ip = talloc_zero(ipalloc_state, struct public_ip_list);
+			if (tmp_ip == NULL) {
+				DEBUG(DEBUG_ERR,
+				      (__location__ " out of memory\n"));
+				talloc_free(ip_tree);
+				return NULL;
+			}
+
+			/* Do not use information about IP addresses hosted
+			 * on other nodes, it may not be accurate */
+			if (public_ips->ip[j].pnn == i) {
+				tmp_ip->pnn = public_ips->ip[j].pnn;
+			} else {
+				tmp_ip->pnn = CTDB_UNKNOWN_PNN;
+			}
+			tmp_ip->addr = public_ips->ip[j].addr;
+			tmp_ip->next = NULL;
+
+			trbt_insertarray32_callback(ip_tree,
+				IP_KEYLEN, ip_key(&public_ips->ip[j].addr),
+				add_ip_callback,
+				tmp_ip);
+		}
+	}
+
+	/* Flatten the tree into a linked list */
+	ip_list = NULL;
+	ret = trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
+	if (ret != 0) {
+		DBG_ERR("Error traversing the IP tree.\n");
+	}
+
+	talloc_free(ip_tree);
+
+	return ip_list;
+}
+
+/* Does list contain an entry with the same IP as addr? */
+static bool public_ip_list_has_addr(struct ctdb_public_ip_list *list,
+				    ctdb_sock_addr *addr)
+{
+	unsigned int k;
+
+	for (k = 0; k < list->num; k++) {
+		if (ctdb_sock_addr_same_ip(addr, &list->ip[k].addr)) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * For every address in the merged list, allocate the known_on and
+ * available_on bitmaps (one bit per node) and set the bits according to
+ * each node's known and available lists.  An address that is available
+ * on a node is also marked as known there.
+ *
+ * Returns false if a bitmap cannot be allocated.
+ */
+static bool populate_bitmap(struct ipalloc_state *ipalloc_state)
+{
+	struct public_ip_list *ip;
+	unsigned int node;
+
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+
+		ip->known_on = bitmap_talloc(ip, ipalloc_state->num);
+		if (ip->known_on == NULL) {
+			return false;
+		}
+
+		ip->available_on = bitmap_talloc(ip, ipalloc_state->num);
+		if (ip->available_on == NULL) {
+			return false;
+		}
+
+		for (node = 0; node < ipalloc_state->num; node++) {
+			struct ctdb_public_ip_list *known =
+				&ipalloc_state->known_public_ips[node];
+			struct ctdb_public_ip_list *avail =
+				&ipalloc_state->available_public_ips[node];
+
+			/* Check to see if "ip" is available on this node */
+			if (public_ip_list_has_addr(avail, &ip->addr)) {
+				bitmap_set(ip->available_on, node);
+			}
+
+			/* Optimisation: available => known */
+			if (bitmap_query(ip->available_on, node)) {
+				bitmap_set(ip->known_on, node);
+				continue;
+			}
+
+			/* Check to see if "ip" is known on this node */
+			if (public_ip_list_has_addr(known, &ip->addr)) {
+				bitmap_set(ip->known_on, node);
+			}
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Record the per-node arrays of known and available public IPs in the
+ * state.  The arrays are referenced, not copied.
+ */
+void ipalloc_set_public_ips(struct ipalloc_state *ipalloc_state,
+			    struct ctdb_public_ip_list *known_ips,
+			    struct ctdb_public_ip_list *available_ips)
+{
+	ipalloc_state->known_public_ips = known_ips;
+	ipalloc_state->available_public_ips = available_ips;
+}
+
+/* Can the cluster host public IPs?
+ *
+ * True as soon as any node reports that it is itself hosting one of its
+ * known addresses.  Failing that, true if addresses are known somewhere
+ * and at least one node has a non-empty available list.  False when no
+ * addresses are known at all, or none is hosted and none is available. */
+bool ipalloc_can_host_ips(struct ipalloc_state *ipalloc_state)
+{
+	struct ctdb_public_ip_list *known = ipalloc_state->known_public_ips;
+	bool any_known = false;
+	unsigned int node, k;
+
+	for (node = 0; node < ipalloc_state->num; node++) {
+		if (known[node].num == 0) {
+			continue;
+		}
+		any_known = true;
+
+		/* Succeed if an address is hosted on this node */
+		for (k = 0; k < known[node].num; k++) {
+			if (known[node].ip[k].pnn == node) {
+				return true;
+			}
+		}
+	}
+
+	if (! any_known) {
+		return false;
+	}
+
+	/* At this point there are known addresses but none are
+	 * hosted.  Need to check if cluster can now host some
+	 * addresses.
+	 */
+	for (node = 0; node < ipalloc_state->num; node++) {
+		if (ipalloc_state->available_public_ips[node].num != 0) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* The calculation part of the IP allocation algorithm.
+ *
+ * Builds the merged address list and its per-node bitmaps, then runs the
+ * configured algorithm over it.  On success returns the list, in which
+ * ->pnn is the node that will own each IP, or CTDB_UNKNOWN_PNN if there
+ * is no node that can cover that IP.  Returns NULL on failure. */
+struct public_ip_list *ipalloc(struct ipalloc_state *ipalloc_state)
+{
+	bool ok;
+
+	ipalloc_state->all_ips = create_merged_ip_list(ipalloc_state);
+	if (ipalloc_state->all_ips == NULL) {
+		return NULL;
+	}
+
+	if (!populate_bitmap(ipalloc_state)) {
+		return NULL;
+	}
+
+	switch (ipalloc_state->algorithm) {
+	case IPALLOC_LCP2:
+		ok = ipalloc_lcp2(ipalloc_state);
+		break;
+	case IPALLOC_DETERMINISTIC:
+		ok = ipalloc_deterministic(ipalloc_state);
+		break;
+	case IPALLOC_NONDETERMINISTIC:
+		ok = ipalloc_nondeterministic(ipalloc_state);
+		break;
+	default:
+		ok = false;
+		break;
+	}
+
+	if (!ok) {
+		return NULL;
+	}
+
+	return ipalloc_state->all_ips;
+}
diff --git a/ctdb/server/ipalloc.h b/ctdb/server/ipalloc.h
new file mode 100644
index 0000000..42aec9e
--- /dev/null
+++ b/ctdb/server/ipalloc.h
@@ -0,0 +1,67 @@
+/*
+ CTDB IP takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_H__
+#define __CTDB_IPALLOC_H__
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/bitmap.h"
+
+/* One public IP address in the merged list that IP allocation works on */
+struct public_ip_list {
+ /* next entry of the singly-linked list, NULL at the end */
+ struct public_ip_list *next;
+ /* node the address is assigned to, CTDB_UNKNOWN_PNN if unassigned */
+ uint32_t pnn;
+ /* the public address itself */
+ ctdb_sock_addr addr;
+ /* one bit per node: set if the address is in that node's known list */
+ struct bitmap *known_on;
+ /* one bit per node: set if it is in that node's available list */
+ struct bitmap *available_on;
+};
+
+/* Number of 32-bit words in the tree key built by ip_key() */
+#define IP_KEYLEN 4
+/* Build the IP_KEYLEN-word tree key for an IPv4 or IPv6 address */
+uint32_t *ip_key(ctdb_sock_addr *ip);
+
+/* Selects which IP allocation algorithm ipalloc() runs. */
+enum ipalloc_algorithm {
+ IPALLOC_DETERMINISTIC,
+ IPALLOC_NONDETERMINISTIC,
+ IPALLOC_LCP2,
+};
+
+struct ipalloc_state;
+
+/* Allocate the allocation state on mem_ctx; returns NULL on failure */
+struct ipalloc_state * ipalloc_state_init(TALLOC_CTX *mem_ctx,
+ uint32_t num_nodes,
+ enum ipalloc_algorithm algorithm,
+ bool no_ip_takeover,
+ bool no_ip_failback,
+ uint32_t *force_rebalance_nodes);
+
+/* Attach the per-node arrays of known and available public IPs */
+void ipalloc_set_public_ips(struct ipalloc_state *ipalloc_state,
+ struct ctdb_public_ip_list *known_ips,
+ struct ctdb_public_ip_list *available_ips);
+
+/* Is any address currently hosted, or known and available somewhere? */
+bool ipalloc_can_host_ips(struct ipalloc_state *ipalloc_state);
+
+/* Run the allocation; returns the list of IPs with ->pnn set, or NULL */
+struct public_ip_list *ipalloc(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_H__ */
diff --git a/ctdb/server/ipalloc_common.c b/ctdb/server/ipalloc_common.c
new file mode 100644
index 0000000..437c511
--- /dev/null
+++ b/ctdb/server/ipalloc_common.c
@@ -0,0 +1,192 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/time.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "common/common.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/* NOTE(review): nothing in this file uses this macro, and it expands to a
+ * reference to a variable named "ctdb" that no function here declares. */
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
+
+/* Return the number of public addresses in the list ips that are
+   currently assigned to node pnn.
+*/
+int node_ip_coverage(uint32_t pnn, struct public_ip_list *ips)
+{
+	struct public_ip_list *cur;
+	int count = 0;
+
+	for (cur = ips; cur != NULL; cur = cur->next) {
+		if (cur->pnn == pnn) {
+			count++;
+		}
+	}
+
+	return count;
+}
+
+
+/* Can the given node host the given IP?
+ *
+ * Answered purely from the IP's available_on bitmap: true if the bit for
+ * pnn is set.  ipalloc_state is not consulted here.
+*/
+static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
+ int32_t pnn,
+ struct public_ip_list *ip)
+{
+ return bitmap_query(ip->available_on, pnn);
+}
+
+/* May the given node take over the given IP?  Never when IP takeover is
+ * disabled in the state; otherwise whenever the node can host the IP.
+ */
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+			  int32_t pnn,
+			  struct public_ip_list *ip)
+{
+	return !ipalloc_state->no_ip_takeover &&
+	       can_node_host_ip(ipalloc_state, pnn, ip);
+}
+
+/* Search the nodes for one to take over this IP.  Among the nodes that
+   may take it over, pick the one currently serving the fewest IPs (the
+   lowest-numbered such node on a tie) so that IPs get spread out evenly.
+
+   On success sets ip->pnn and returns 0.  Returns -1, leaving ip->pnn
+   untouched, if no node can take the IP over.
+*/
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+		       struct public_ip_list *ip)
+{
+	const unsigned int node_count = ipalloc_state->num;
+	unsigned int best = CTDB_UNKNOWN_PNN;
+	int best_load = 0;
+	unsigned int node;
+
+	for (node = 0; node < node_count; node++) {
+		int load;
+
+		/* skip nodes that cannot serve this ip */
+		if (!can_node_takeover_ip(ipalloc_state, node, ip)) {
+			continue;
+		}
+
+		load = node_ip_coverage(node, ipalloc_state->all_ips);
+
+		/* first candidate, or strictly less loaded than the best */
+		if (best == CTDB_UNKNOWN_PNN || load < best_load) {
+			best = node;
+			best_load = load;
+		}
+	}
+
+	if (best == CTDB_UNKNOWN_PNN) {
+		DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
+				     ctdb_sock_addr_to_string(ipalloc_state,
+							      &ip->addr,
+							      false)));
+
+		return -1;
+	}
+
+	ip->pnn = best;
+	return 0;
+}
+
+/*
+ * Build the IP_KEYLEN-word tree key for an address.
+ *
+ * IPv4 addresses occupy only the last word, IPv6 addresses all four; each
+ * word is passed through htonl().  An unknown address family is logged
+ * and yields an all-zero key.
+ *
+ * The key lives in a static buffer: it is overwritten by the next call
+ * and must not be freed.
+ */
+uint32_t *ip_key(ctdb_sock_addr *ip)
+{
+	static uint32_t key[IP_KEYLEN];
+
+	bzero(key, sizeof(key));
+
+	if (ip->sa.sa_family == AF_INET) {
+		key[3] = htonl(ip->ip.sin_addr.s_addr);
+	} else if (ip->sa.sa_family == AF_INET6) {
+		uint32_t *words = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
+		unsigned int w;
+
+		for (w = 0; w < 4; w++) {
+			key[w] = htonl(words[w]);
+		}
+	} else {
+		DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
+	}
+
+	return key;
+}
+
+/* Allocate any unassigned IPs just by looping through the IPs and
+ * finding the best node for each.  An IP for which no node is found
+ * stays unassigned and a warning is logged.
+ */
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
+{
+	struct public_ip_list *ip;
+
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+		/* only unassigned ips need a node */
+		if (ip->pnn != CTDB_UNKNOWN_PNN) {
+			continue;
+		}
+
+		if (find_takeover_node(ipalloc_state, ip) != 0) {
+			DEBUG(DEBUG_WARNING,
+			      ("Failed to find node to cover ip %s\n",
+			       ctdb_sock_addr_to_string(ipalloc_state,
+							&ip->addr,
+							false)));
+		}
+	}
+}
+
+/*
+ * Verify that each assigned node can actually host its public IP and
+ * reset the assignment to CTDB_UNKNOWN_PNN where it cannot.  IPs that
+ * are already unassigned are left alone.
+ */
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
+{
+	struct public_ip_list *t;
+
+	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+		if (t->pnn == CTDB_UNKNOWN_PNN) {
+			continue;
+		}
+		if (!can_node_host_ip(ipalloc_state, t->pnn, t)) {
+			/* this node can not serve this ip. */
+			/* pnn is a uint32_t: print it as unsigned */
+			DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %u\n",
+					   ctdb_sock_addr_to_string(
+						   ipalloc_state,
+						   &t->addr, false),
+					   (unsigned int)t->pnn));
+			t->pnn = CTDB_UNKNOWN_PNN;
+		}
+	}
+}
diff --git a/ctdb/server/ipalloc_deterministic.c b/ctdb/server/ipalloc_deterministic.c
new file mode 100644
index 0000000..097d816
--- /dev/null
+++ b/ctdb/server/ipalloc_deterministic.c
@@ -0,0 +1,63 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "server/ipalloc_private.h"
+
+/* Deterministic allocation: the n-th address in all_ips is assigned to
+ * node (n modulo number-of-nodes).  Assignments that the chosen node
+ * can not host are then dropped and re-allocated with the basic
+ * algorithm.  Always returns true.
+ */
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
+{
+	struct public_ip_list *t;
+	uint32_t i, numnodes;
+
+	numnodes = ipalloc_state->num;
+
+	DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+	/* Allocate IPs to nodes in a modulo fashion so that IPs will
+	 * always be allocated the same way for a specific set of
+	 * available/unavailable nodes.
+	 */
+
+	for (i = 0, t = ipalloc_state->all_ips; t != NULL; t = t->next, i++) {
+		if (numnodes == 0) {
+			/* No nodes at all: avoid the modulo by zero and
+			 * leave the address unassigned.
+			 */
+			t->pnn = CTDB_UNKNOWN_PNN;
+			continue;
+		}
+		t->pnn = i % numnodes;
+	}
+
+	/* IP failback doesn't make sense with deterministic
+	 * IPs, since the modulo step above implicitly fails
+	 * back IPs to their "home" node.
+	 */
+	if (ipalloc_state->no_ip_failback) {
+		D_WARNING("WARNING: 'NoIPFailback' set but ignored - "
+			  "incompatible with 'Deterministic IPs'\n");
+	}
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	basic_allocate_unassigned(ipalloc_state);
+
+	/* No failback here! */
+
+	return true;
+}
diff --git a/ctdb/server/ipalloc_lcp2.c b/ctdb/server/ipalloc_lcp2.c
new file mode 100644
index 0000000..bc2936b
--- /dev/null
+++ b/ctdb/server/ipalloc_lcp2.c
@@ -0,0 +1,525 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/*
+ * This is the length of the longest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes.  The implementation means that all
+ * addresses end up being 128 bits long.
+ *
+ * The keys are compared one 32-bit word at a time, most significant
+ * word first.  Counting stops at the first differing bit: matching
+ * bits after that point are not part of the common prefix.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
+ */
+static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
+{
+	uint32_t ip1_k[IP_KEYLEN];
+	uint32_t *t;
+	int i;
+
+	uint32_t distance = 0;
+
+	/* ip_key() hands back a shared static buffer, so the first key
+	 * must be copied before the second one is computed.
+	 */
+	memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+	t = ip_key(ip2);
+	for (i = 0; i < IP_KEYLEN; i++) {
+		uint32_t x = ip1_k[i] ^ t[i];
+
+		if (x == 0) {
+			/* Whole word matches, keep going */
+			distance += 32;
+			continue;
+		}
+
+		/* Count number of leading zeroes.  x is non-zero
+		 * here so the loop terminates.
+		 * FIXME? This could be optimised...
+		 */
+		while ((x & ((uint32_t)1 << 31)) == 0) {
+			x <<= 1;
+			distance += 1;
+		}
+
+		/* The common prefix ends in this word */
+		break;
+	}
+
+	return distance;
+}
+
+/* Sum of the squared ip_distance() between the given address and every
+ * address in ips that is assigned to node pnn.  The ips argument is
+ * generally the all_ips list used in the main part of the algorithm.
+ */
+static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+				  struct public_ip_list *ips,
+				  unsigned int pnn)
+{
+	struct public_ip_list *cur = ips;
+	uint32_t total = 0;
+
+	while (cur != NULL) {
+		/* An address is never measured against itself.  That
+		 * lets the same routine give the cost of removing an
+		 * address from a node: it is just the distance to all
+		 * the other addresses already there.  Identity is
+		 * decided by pointer, on the assumption that both the
+		 * argument and the list entries come from all_ips,
+		 * which is cheaper than comparing address contents.
+		 */
+		if (cur->pnn == pnn && &(cur->addr) != ip) {
+			uint32_t dist = ip_distance(ip, &(cur->addr));
+
+			/* Squaring by hand avoids needing math.h */
+			total += dist * dist;
+		}
+		cur = cur->next;
+	}
+
+	return total;
+}
+
+/* LCP2 imbalance metric of one node: the accumulated
+ * ip_distance_2_sum() over the addresses currently assigned to pnn.
+ */
+static uint32_t lcp2_imbalance(struct public_ip_list * all_ips,
+			       unsigned int pnn)
+{
+	struct public_ip_list *cur;
+	uint32_t total = 0;
+
+	for (cur = all_ips; cur != NULL; cur = cur->next) {
+		if (cur->pnn == pnn) {
+			/* Only the remainder of the list is handed
+			 * down, so each pair of addresses contributes
+			 * once rather than twice.
+			 */
+			total += ip_distance_2_sum(&(cur->addr),
+						   cur->next,
+						   pnn);
+		}
+	}
+
+	return total;
+}
+
+/* Allocate and fill the two per-node arrays used by the LCP2 algorithm.
+ *
+ * On success *lcp2_imbalances holds the current imbalance of every
+ * node and *rebalance_candidates says which nodes may receive
+ * addresses during failback; both are talloc children of
+ * ipalloc_state.  Returns false on allocation failure, in which case
+ * both output pointers are NULL.
+ */
+static bool lcp2_init(struct ipalloc_state *ipalloc_state,
+		      uint32_t **lcp2_imbalances,
+		      bool **rebalance_candidates)
+{
+	unsigned int i, numnodes;
+	struct public_ip_list *t;
+
+	numnodes = ipalloc_state->num;
+
+	*lcp2_imbalances = NULL;
+	*rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
+	if (*rebalance_candidates == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return false;
+	}
+	*lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
+	if (*lcp2_imbalances == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		/* Don't leave the first array behind on failure */
+		talloc_free(*rebalance_candidates);
+		*rebalance_candidates = NULL;
+		return false;
+	}
+
+	for (i=0; i<numnodes; i++) {
+		(*lcp2_imbalances)[i] =
+			lcp2_imbalance(ipalloc_state->all_ips, i);
+		/* First step: assume all nodes are candidates */
+		(*rebalance_candidates)[i] = true;
+	}
+
+	/* 2nd step: if a node has IPs assigned then it must have been
+	 * healthy before, so we remove it from consideration.  This
+	 * is overkill but is all we have because we don't maintain
+	 * state between takeover runs.  An alternative would be to
+	 * keep state and invalidate it every time the recovery master
+	 * changes.
+	 */
+	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+		if (t->pnn == CTDB_UNKNOWN_PNN) {
+			continue;
+		}
+		if (t->pnn >= numnodes) {
+			/* Never index the array with a node number
+			 * that is outside of it.
+			 */
+			DEBUG(DEBUG_ERR,
+			      (__location__ " IP assigned to unknown node %u\n",
+			       t->pnn));
+			continue;
+		}
+		(*rebalance_candidates)[t->pnn] = false;
+	}
+
+	/* 3rd step: if a node is forced to re-balance then
+	   we allow failback onto the node */
+	if (ipalloc_state->force_rebalance_nodes == NULL) {
+		return true;
+	}
+	for (i = 0;
+	     i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
+	     i++) {
+		uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
+		if (pnn >= numnodes) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ " unknown node %u\n", pnn));
+			continue;
+		}
+
+		DEBUG(DEBUG_NOTICE,
+		      ("Forcing rebalancing of IPs to node %u\n", pnn));
+		(*rebalance_candidates)[pnn] = true;
+	}
+
+	return true;
+}
+
+/* Place unassigned addresses one at a time.  Each pass looks at every
+ * (unassigned address, eligible node) pair, picks the pair with the
+ * smallest added distance sum and commits it, updating that node's
+ * entry in lcp2_imbalances.  Passes repeat until nothing is left
+ * unassigned or a pass finds no eligible pair; addresses that remain
+ * unassigned are reported with a warning.
+ */
+static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
+				     uint32_t *lcp2_imbalances)
+{
+	unsigned int numnodes = ipalloc_state->num;
+	struct public_ip_list *ip;
+	bool pending = true;	/* at least one address has no node */
+	bool progress = true;	/* the last pass found a candidate */
+
+	while (pending && progress) {
+		struct public_ip_list *best_ip = NULL;
+		unsigned int best_node = CTDB_UNKNOWN_PNN;
+		uint32_t best_dsum = 0;
+		uint32_t best_imbl = 0;
+		unsigned int node;
+
+		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+		DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+		for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+			if (ip->pnn != CTDB_UNKNOWN_PNN) {
+				/* Only unassigned addresses matter here */
+				continue;
+			}
+
+			for (node = 0; node < numnodes; node++) {
+				uint32_t dsum;
+				uint32_t imbl;
+
+				if (!can_node_takeover_ip(ipalloc_state,
+							  node,
+							  ip)) {
+					/* Node is not eligible for this ip */
+					continue;
+				}
+
+				dsum = ip_distance_2_sum(&(ip->addr),
+							 ipalloc_state->all_ips,
+							 node);
+				imbl = lcp2_imbalances[node] + dsum;
+				DEBUG(DEBUG_DEBUG,
+				      (" %s -> %d [+%d]\n",
+				       ctdb_sock_addr_to_string(ipalloc_state,
+								&(ip->addr),
+								false),
+				       node,
+				       imbl - lcp2_imbalances[node]));
+
+				if (best_node != CTDB_UNKNOWN_PNN &&
+				    dsum >= best_dsum) {
+					/* Not better than what we have */
+					continue;
+				}
+
+				best_node = node;
+				best_imbl = imbl;
+				best_dsum = dsum;
+				best_ip = ip;
+			}
+		}
+
+		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+		/* Commit the cheapest placement, if this pass found one */
+		progress = (best_node != CTDB_UNKNOWN_PNN);
+		if (progress) {
+			best_ip->pnn = best_node;
+			lcp2_imbalances[best_node] = best_imbl;
+			DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+					  ctdb_sock_addr_to_string(
+						  ipalloc_state,
+						  &(best_ip->addr), false),
+					  best_node,
+					  best_dsum));
+		}
+
+		/* Is anything still waiting for a node? */
+		pending = false;
+		for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+			if (ip->pnn == CTDB_UNKNOWN_PNN) {
+				pending = true;
+				break;
+			}
+		}
+	}
+
+	if (!pending) {
+		/* Everything was placed, nothing to report */
+		return;
+	}
+
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+		if (ip->pnn != CTDB_UNKNOWN_PNN) {
+			continue;
+		}
+		DEBUG(DEBUG_WARNING,
+		      ("Failed to find node to cover ip %s\n",
+		       ctdb_sock_addr_to_string(ipalloc_state,
+						&ip->addr,
+						false)));
+	}
+}
+
+/* One rebalancing step for a given source node.  Every address on
+ * srcnode is tried against every node flagged in rebalance_candidates
+ * that can take it over, and the move giving the lowest combined
+ * source + destination imbalance is applied.
+ *
+ * Returns true when an address was moved (lcp2_imbalances and the
+ * address's pnn are updated), false when no acceptable move exists.
+ */
+static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
+				    unsigned int srcnode,
+				    uint32_t *lcp2_imbalances,
+				    bool *rebalance_candidates)
+{
+	unsigned int numnodes = ipalloc_state->num;
+	struct public_ip_list *best_ip = NULL;
+	unsigned int best_dst = CTDB_UNKNOWN_PNN;
+	uint32_t best_src_imbl = 0;
+	uint32_t best_dst_imbl = 0;
+	struct public_ip_list *ip;
+
+	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+	DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+			   srcnode, lcp2_imbalances[srcnode]));
+
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+		uint32_t srcdsum;
+		uint32_t srcimbl;
+		unsigned int dst;
+
+		if (ip->pnn != srcnode) {
+			/* Not hosted on the source node */
+			continue;
+		}
+
+		/* Cost of this address to the source node, and what
+		 * the source imbalance would be without it.
+		 */
+		srcdsum = ip_distance_2_sum(&(ip->addr),
+					    ipalloc_state->all_ips,
+					    srcnode);
+		srcimbl = lcp2_imbalances[srcnode] - srcdsum;
+
+		/* Destinations are restricted to rebalance candidates
+		 * so that addresses are not shuffled around just for
+		 * minor balance improvements.
+		 */
+		for (dst = 0; dst < numnodes; dst++) {
+			uint32_t dstdsum;
+			uint32_t dstimbl;
+
+			if (!rebalance_candidates[dst]) {
+				continue;
+			}
+
+			if (!can_node_takeover_ip(ipalloc_state, dst, ip)) {
+				/* Node is not eligible for this ip */
+				continue;
+			}
+
+			dstdsum = ip_distance_2_sum(&(ip->addr),
+						    ipalloc_state->all_ips,
+						    dst);
+			dstimbl = lcp2_imbalances[dst] + dstdsum;
+			DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+					   srcnode, -srcdsum,
+					   ctdb_sock_addr_to_string(
+						   ipalloc_state,
+						   &(ip->addr), false),
+					   dst, dstdsum));
+
+			/* The destination must end up below the
+			 * source's current imbalance...
+			 */
+			if (dstimbl >= lcp2_imbalances[srcnode]) {
+				continue;
+			}
+			/* ...the address must cost less there than it
+			 * does on the source...
+			 */
+			if (dstdsum >= srcdsum) {
+				continue;
+			}
+			/* ...and the move must beat the best so far */
+			if ((best_dst != CTDB_UNKNOWN_PNN) &&
+			    ((srcimbl + dstimbl) >=
+			     (best_src_imbl + best_dst_imbl))) {
+				continue;
+			}
+
+			best_ip = ip;
+			best_src_imbl = srcimbl;
+			best_dst = dst;
+			best_dst_imbl = dstimbl;
+		}
+	}
+	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+	if (best_dst == CTDB_UNKNOWN_PNN) {
+		/* No move improves things */
+		return false;
+	}
+
+	/* Log the deltas before the imbalances are overwritten */
+	DEBUG(DEBUG_INFO,
+	      ("%d [%d] -> %s -> %d [+%d]\n",
+	       srcnode, best_src_imbl - lcp2_imbalances[srcnode],
+	       ctdb_sock_addr_to_string(ipalloc_state,
+					&(best_ip->addr), false),
+	       best_dst, best_dst_imbl - lcp2_imbalances[best_dst]));
+
+	lcp2_imbalances[srcnode] = best_src_imbl;
+	lcp2_imbalances[best_dst] = best_dst_imbl;
+	best_ip->pnn = best_dst;
+
+	return true;
+}
+
+/* A node number paired with its imbalance, so that nodes can be sorted
+ * by imbalance without losing track of which node is which.
+ */
+struct lcp2_imbalance_pnn {
+	uint32_t imbalance;
+	unsigned int pnn;
+};
+
+/* qsort() comparison function ordering struct lcp2_imbalance_pnn
+ * entries by descending imbalance: the larger imbalance sorts first.
+ */
+static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+	const struct lcp2_imbalance_pnn *left = a;
+	const struct lcp2_imbalance_pnn *right = b;
+
+	if (left->imbalance < right->imbalance) {
+		return 1;
+	}
+	if (left->imbalance > right->imbalance) {
+		return -1;
+	}
+	return 0;
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  Nodes are sorted by
+ * descending imbalance and offered, in that order, to
+ * lcp2_failback_candidate() as the source of a move.  As soon as one
+ * move succeeds the nodes are re-sorted and the search starts over;
+ * the function returns once no node yields a move.
+ *
+ * If the scratch array can not be allocated the error is logged and
+ * no rebalancing is done.
+ */
+static void lcp2_failback(struct ipalloc_state *ipalloc_state,
+			  uint32_t *lcp2_imbalances,
+			  bool *rebalance_candidates)
+{
+	int i, numnodes;
+	struct lcp2_imbalance_pnn * lips;
+	bool again;
+
+	numnodes = ipalloc_state->num;
+
+	/* The array has the same size on every pass, so allocate it
+	 * once and refill it rather than reallocating per pass.
+	 */
+	lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
+	if (lips == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return;
+	}
+
+	do {
+		/* Put the imbalances and nodes into an array, sort them
+		 * and iterate through candidates.  Usually the 1st one
+		 * will be used, so this doesn't cost much...
+		 */
+		DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
+		DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
+		for (i = 0; i < numnodes; i++) {
+			lips[i].imbalance = lcp2_imbalances[i];
+			lips[i].pnn = i;
+			DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
+		}
+		qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
+		      lcp2_cmp_imbalance_pnn);
+
+		again = false;
+		for (i = 0; i < numnodes; i++) {
+			/* The array is sorted in descending order, so
+			 * a zero here means every remaining node has
+			 * zero imbalance (0 or 1 addresses).
+			 */
+			if (lips[i].imbalance == 0) {
+				break;
+			}
+
+			if (lcp2_failback_candidate(ipalloc_state,
+						    lips[i].pnn,
+						    lcp2_imbalances,
+						    rebalance_candidates)) {
+				again = true;
+				break;
+			}
+		}
+	} while (again);
+
+	talloc_free(lips);
+}
+
+/* Entry point of the LCP2 allocation algorithm: drop assignments that
+ * the assigned node can not host, place unassigned addresses and, unless
+ * failback is disabled or no node is a rebalance candidate, rebalance.
+ * Returns false only if lcp2_init() fails.
+ */
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
+{
+	uint32_t *lcp2_imbalances;
+	bool *rebalance_candidates;
+	int numnodes, i;
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	if (!lcp2_init(ipalloc_state,
+		       &lcp2_imbalances, &rebalance_candidates)) {
+		return false;
+	}
+
+	lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
+
+	if (ipalloc_state->no_ip_failback) {
+		/* Failback is disabled, so no rebalancing either */
+		return true;
+	}
+
+	/* Rebalancing needs at least one node that addresses may be
+	 * moved to.  Checking for that here is far cheaper than
+	 * running the failback pass for nothing.
+	 */
+	numnodes = ipalloc_state->num;
+	for (i = 0; i < numnodes; i++) {
+		if (rebalance_candidates[i]) {
+			break;
+		}
+	}
+	if (i >= numnodes) {
+		return true;
+	}
+
+	/* Try to spread the IP addresses evenly across the nodes */
+	lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
+
+	return true;
+}
diff --git a/ctdb/server/ipalloc_nondeterministic.c b/ctdb/server/ipalloc_nondeterministic.c
new file mode 100644
index 0000000..1e70c8b
--- /dev/null
+++ b/ctdb/server/ipalloc_nondeterministic.c
@@ -0,0 +1,150 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+#include "common/common.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/* Basic non-deterministic rebalancing algorithm.
+ *
+ * For each assigned address the nodes able to take it over are
+ * compared by the number of addresses they serve.  When the busiest
+ * and the least busy differ by two or more, the first address found
+ * on the busiest node is handed to find_takeover_node() and the whole
+ * scan restarts.  The number of such reassignments is capped at
+ * num_ips + 5.
+ */
+static void basic_failback(struct ipalloc_state *ipalloc_state,
+			   int num_ips)
+{
+	unsigned int numnodes = ipalloc_state->num;
+	unsigned int node, busiest, idlest;
+	int most, least, count;
+	int retries = 0;
+	struct public_ip_list *ip;
+	bool restart;
+
+	do {
+		restart = false;
+		most = 0;
+		least = 0;
+
+		for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+			struct public_ip_list *victim;
+
+			if (ip->pnn == CTDB_UNKNOWN_PNN) {
+				continue;
+			}
+
+			/* Highest and lowest coverage among the nodes
+			 * that can take over this address.
+			 */
+			busiest = CTDB_UNKNOWN_PNN;
+			idlest = CTDB_UNKNOWN_PNN;
+			for (node = 0; node < numnodes; node++) {
+				if (!can_node_takeover_ip(ipalloc_state,
+							  node, ip)) {
+					/* Node is not eligible */
+					continue;
+				}
+
+				count = node_ip_coverage(node,
+							 ipalloc_state->all_ips);
+				if (busiest == CTDB_UNKNOWN_PNN ||
+				    count > most) {
+					busiest = node;
+					most = count;
+				}
+				if (idlest == CTDB_UNKNOWN_PNN ||
+				    count < least) {
+					idlest = node;
+					least = count;
+				}
+			}
+			if (busiest == CTDB_UNKNOWN_PNN) {
+				DEBUG(DEBUG_WARNING,
+				      (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
+				       ctdb_sock_addr_to_string(ipalloc_state,
+								&ip->addr, false)));
+
+				continue;
+			}
+
+			/* A spread of less than 2 is balanced enough */
+			if (most <= least+1) {
+				continue;
+			}
+			/* Bound the time spent on balancing */
+			if (retries >= (num_ips + 5)) {
+				continue;
+			}
+
+			/* Reassign one of the busiest node's addresses */
+			for (victim = ipalloc_state->all_ips;
+			     victim != NULL;
+			     victim = victim->next) {
+				if (victim->pnn == busiest) {
+					break;
+				}
+			}
+			if (victim == NULL) {
+				continue;
+			}
+
+			(void)find_takeover_node(ipalloc_state, victim);
+			retries++;
+			restart = true;
+			break;
+		}
+	} while (restart);
+}
+
+/* Non-deterministic allocation: drop assignments that the assigned
+ * node can not host, place unassigned addresses with the basic
+ * algorithm and, unless failback is disabled, rebalance with
+ * basic_failback().  Always returns true.
+ */
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
+{
+	struct public_ip_list *ip = ipalloc_state->all_ips;
+	int num_ips = 0;
+
+	/* Count the addresses; basic_failback() uses the count as its
+	 * retry bound.  This should be pushed down into basic_failback.
+	 */
+	while (ip != NULL) {
+		num_ips++;
+		ip = ip->next;
+	}
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	basic_allocate_unassigned(ipalloc_state);
+
+	if (!ipalloc_state->no_ip_failback) {
+		/* Try to spread the IP addresses evenly across the
+		 * nodes.  Skipped when failback is not wanted.
+		 */
+		basic_failback(ipalloc_state, num_ips);
+	}
+
+	return true;
+}
diff --git a/ctdb/server/ipalloc_private.h b/ctdb/server/ipalloc_private.h
new file mode 100644
index 0000000..3ea3d31
--- /dev/null
+++ b/ctdb/server/ipalloc_private.h
@@ -0,0 +1,57 @@
+/*
+ CTDB IP takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_PRIVATE_H__
+#define __CTDB_IPALLOC_PRIVATE_H__
+
+#include "protocol/protocol.h"
+
+#include "server/ipalloc.h"
+
+/* State shared by all of the IP allocation algorithms */
+struct ipalloc_state {
+	/* Number of nodes; node numbers used by the algorithms run
+	 * from 0 to num - 1.
+	 */
+	uint32_t num;
+
+	/* Arrays with data for each node */
+	struct ctdb_public_ip_list *available_public_ips;
+	struct ctdb_public_ip_list *known_public_ips;
+
+	/* Linked list of every public address together with the node
+	 * it is assigned to (CTDB_UNKNOWN_PNN when unassigned).
+	 */
+	struct public_ip_list *all_ips;
+	enum ipalloc_algorithm algorithm;
+	/* When set, the algorithms skip their rebalancing step */
+	bool no_ip_failback;
+	/* NOTE(review): not referenced by the algorithm files in this
+	 * patch - presumably checked when deciding takeover eligibility;
+	 * confirm against ipalloc_common.c.
+	 */
+	bool no_ip_takeover;
+	/* talloc array of node numbers that LCP2 always treats as
+	 * rebalance candidates; may be NULL.
+	 */
+	uint32_t *force_rebalance_nodes;
+};
+
+/* Helpers shared between the allocation algorithms */
+
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+			  int32_t pnn,
+			  struct public_ip_list *ip);
+/* Used by the basic failback code as the number of addresses in ips
+ * that node pnn serves.
+ */
+int node_ip_coverage(uint32_t pnn, struct public_ip_list *ips);
+/* Returns 0 after setting ip->pnn, or -1 if no node was found */
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+		       struct public_ip_list *ip);
+
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state);
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state);
+
+/* The allocation algorithms themselves */
+
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_PRIVATE_H__ */
diff --git a/ctdb/server/legacy_conf.c b/ctdb/server/legacy_conf.c
new file mode 100644
index 0000000..3391a3b
--- /dev/null
+++ b/ctdb/server/legacy_conf.c
@@ -0,0 +1,80 @@
+/*
+ CTDB legacy config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "lib/util/debug.h"
+
+#include "common/conf.h"
+#include "common/logging.h"
+
+#include "legacy_conf.h"
+
+#define LEGACY_SCRIPT_LOG_LEVEL_DEFAULT "ERROR"
+
+/* Validation callback for the script log level option: the new value
+ * is accepted only if debug_level_parse() understands it.  The old
+ * value and the update mode are not consulted.
+ */
+static bool legacy_conf_validate_script_log_level(const char *key,
+						  const char *old_loglevel,
+						  const char *new_loglevel,
+						  enum conf_update_mode mode)
+{
+	/* Only needed as an output argument; the value is discarded */
+	int parsed_level;
+
+	if (debug_level_parse(new_loglevel, &parsed_level)) {
+		return true;
+	}
+
+	D_ERR("Invalid value for [%s] -> %s = %s\n",
+	      LEGACY_CONF_SECTION,
+	      key,
+	      new_loglevel);
+	return false;
+}
+
+/* Register the [legacy] section and its options, with their default
+ * values, in the given configuration context.
+ */
+void legacy_conf_init(struct conf_context *conf)
+{
+	/* Boolean options, none of which has a validation callback */
+	static const struct {
+		const char *key;
+		bool default_value;
+	} bool_options[] = {
+		{ LEGACY_CONF_REALTIME_SCHEDULING, true },
+		{ LEGACY_CONF_LMASTER_CAPABILITY, true },
+		{ LEGACY_CONF_START_AS_STOPPED, false },
+		{ LEGACY_CONF_START_AS_DISABLED, false },
+	};
+	size_t n = sizeof(bool_options) / sizeof(bool_options[0]);
+	size_t i;
+
+	conf_define_section(conf, LEGACY_CONF_SECTION, NULL);
+
+	for (i = 0; i < n; i++) {
+		conf_define_boolean(conf,
+				    LEGACY_CONF_SECTION,
+				    bool_options[i].key,
+				    bool_options[i].default_value,
+				    NULL);
+	}
+
+	/* The log level is a string, checked by the validation callback */
+	conf_define_string(conf,
+			   LEGACY_CONF_SECTION,
+			   LEGACY_CONF_SCRIPT_LOG_LEVEL,
+			   LEGACY_SCRIPT_LOG_LEVEL_DEFAULT,
+			   legacy_conf_validate_script_log_level);
+}
diff --git a/ctdb/server/legacy_conf.h b/ctdb/server/legacy_conf.h
new file mode 100644
index 0000000..b6b4b57
--- /dev/null
+++ b/ctdb/server/legacy_conf.h
@@ -0,0 +1,35 @@
+/*
+ CTDB legacy config handling
+
+ Copyright (C) Martin Schwenke 2018
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_LEGACY_CONF_H__
+#define __CTDB_LEGACY_CONF_H__
+
+#include "common/conf.h"
+
+/* Name of the configuration section holding the legacy options */
+#define LEGACY_CONF_SECTION "legacy"
+
+/* Option names within that section.  The first four are boolean
+ * options (defaulting to true, true, false and false respectively),
+ * the last one is a string option defaulting to "ERROR".
+ */
+#define LEGACY_CONF_REALTIME_SCHEDULING "realtime scheduling"
+#define LEGACY_CONF_LMASTER_CAPABILITY "lmaster capability"
+#define LEGACY_CONF_START_AS_STOPPED "start as stopped"
+#define LEGACY_CONF_START_AS_DISABLED "start as disabled"
+#define LEGACY_CONF_SCRIPT_LOG_LEVEL "script log level"
+
+/* Define the section and options above, with defaults, in conf */
+void legacy_conf_init(struct conf_context *conf);
+
+#endif /* __CTDB_LEGACY_CONF_H__ */