summaryrefslogtreecommitdiffstats
path: root/ctdb/server/ctdb_daemon.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 17:20:00 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 17:20:00 +0000
commit8daa83a594a2e98f39d764422bfbdbc62c9efd44 (patch)
tree4099e8021376c7d8c05bdf8503093d80e9c7bad0 /ctdb/server/ctdb_daemon.c
parentInitial commit. (diff)
downloadsamba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.tar.xz
samba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.zip
Adding upstream version 2:4.20.0+dfsg.upstream/2%4.20.0+dfsg
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ctdb/server/ctdb_daemon.c')
-rw-r--r--ctdb/server/ctdb_daemon.c2248
1 files changed, 2248 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c
new file mode 100644
index 0000000..eb9d634
--- /dev/null
+++ b/ctdb/server/ctdb_daemon.c
@@ -0,0 +1,2248 @@
+/*
+ ctdb daemon code
+
+ Copyright (C) Andrew Tridgell 2006
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "system/time.h"
+
+#include <talloc.h>
+/* Allow use of deprecated function tevent_loop_allow_nesting() */
+#define TEVENT_DEPRECATED
+#include <tevent.h>
+#include <tdb.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/time.h"
+#include "lib/util/blocking.h"
+#include "lib/util/become_daemon.h"
+
+#include "version.h"
+#include "ctdb_private.h"
+#include "ctdb_client.h"
+
+#include "common/rb_tree.h"
+#include "common/reqid.h"
+#include "common/system.h"
+#include "common/common.h"
+#include "common/logging.h"
+#include "common/pidfile.h"
+#include "common/sock_io.h"
+
+struct ctdb_client_pid_list {
+ struct ctdb_client_pid_list *next, *prev;
+ struct ctdb_context *ctdb;
+ pid_t pid;
+ struct ctdb_client *client;
+};
+
+const char *ctdbd_pidfile = NULL;
+static struct pidfile_context *ctdbd_pidfile_ctx = NULL;
+
+static void daemon_incoming_packet(void *, struct ctdb_req_header *);
+
+static pid_t __ctdbd_pid;
+
+static void print_exit_message(void)
+{
+ if (getpid() == __ctdbd_pid) {
+ DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
+
+ /* Wait a second to allow pending log messages to be flushed */
+ sleep(1);
+ }
+}
+
+#ifdef HAVE_GETRUSAGE
+
+struct cpu_check_threshold_data {
+ unsigned short percent;
+ struct timeval timeofday;
+ struct timeval ru_time;
+};
+
+static void ctdb_cpu_check_threshold(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval tv,
+ void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type_abort(
+ private_data, struct ctdb_context);
+ uint32_t interval = 60;
+
+ static unsigned short threshold = 0;
+ static struct cpu_check_threshold_data prev = {
+ .percent = 0,
+ .timeofday = { .tv_sec = 0 },
+ .ru_time = { .tv_sec = 0 },
+ };
+
+ struct rusage usage;
+ struct cpu_check_threshold_data curr = {
+ .percent = 0,
+ };
+ int64_t ru_time_diff, timeofday_diff;
+ bool first;
+ int ret;
+
+ /*
+ * Cache the threshold so that we don't waste time checking
+ * the environment variable every time
+ */
+ if (threshold == 0) {
+ const char *t;
+
+ threshold = 90;
+
+ t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD");
+ if (t != NULL) {
+ int th;
+
+ th = atoi(t);
+ if (th <= 0 || th > 100) {
+ DBG_WARNING("Failed to parse env var: %s\n", t);
+ } else {
+ threshold = th;
+ }
+ }
+ }
+
+ ret = getrusage(RUSAGE_SELF, &usage);
+ if (ret != 0) {
+ DBG_WARNING("rusage() failed: %d\n", ret);
+ goto next;
+ }
+
+ /* Sum the system and user CPU usage */
+ curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime);
+
+ curr.timeofday = tv;
+
+ first = timeval_is_zero(&prev.timeofday);
+ if (first) {
+ /* No previous values recorded so no calculation to do */
+ goto done;
+ }
+
+ timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday);
+ if (timeofday_diff <= 0) {
+ /*
+ * Time went backwards or didn't progress so no (sane)
+ * calculation can be done
+ */
+ goto done;
+ }
+
+ ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time);
+
+ curr.percent = ru_time_diff * 100 / timeofday_diff;
+
+ if (curr.percent >= threshold) {
+ /* Log only if the utilisation changes */
+ if (curr.percent != prev.percent) {
+ D_WARNING("WARNING: CPU utilisation %hu%% >= "
+ "threshold (%hu%%)\n",
+ curr.percent,
+ threshold);
+ }
+ } else {
+ /* Log if the utilisation falls below the threshold */
+ if (prev.percent >= threshold) {
+ D_WARNING("WARNING: CPU utilisation %hu%% < "
+ "threshold (%hu%%)\n",
+ curr.percent,
+ threshold);
+ }
+ }
+
+done:
+ prev = curr;
+
+next:
+ tevent_add_timer(ctdb->ev, ctdb,
+ timeval_current_ofs(interval, 0),
+ ctdb_cpu_check_threshold,
+ ctdb);
+}
+
+static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb)
+{
+ tevent_add_timer(ctdb->ev, ctdb,
+ timeval_current(),
+ ctdb_cpu_check_threshold,
+ ctdb);
+}
+#endif /* HAVE_GETRUSAGE */
+
+static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+ if (getpid() != ctdb->ctdbd_pid) {
+ return;
+ }
+
+ tevent_add_timer(ctdb->ev, ctdb,
+ timeval_current_ofs(1, 0),
+ ctdb_time_tick, ctdb);
+}
+
+/* Used to trigger a dummy event once per second, to make
+ * detection of hangs more reliable.
+ */
+static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
+{
+ tevent_add_timer(ctdb->ev, ctdb,
+ timeval_current_ofs(1, 0),
+ ctdb_time_tick, ctdb);
+}
+
+static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
+{
+ /* start monitoring for connected/disconnected nodes */
+ ctdb_start_keepalive(ctdb);
+
+ /* start periodic update of tcp tickle lists */
+ ctdb_start_tcp_tickle_update(ctdb);
+
+ /* start listening for recovery daemon pings */
+ ctdb_control_recd_ping(ctdb);
+
+ /* start listening to timer ticks */
+ ctdb_start_time_tickd(ctdb);
+
+#ifdef HAVE_GETRUSAGE
+ ctdb_start_cpu_check_threshold(ctdb);
+#endif /* HAVE_GETRUSAGE */
+}
+
+static void ignore_signal(int signum)
+{
+ struct sigaction act;
+
+ memset(&act, 0, sizeof(act));
+
+ act.sa_handler = SIG_IGN;
+ sigemptyset(&act.sa_mask);
+ sigaddset(&act.sa_mask, signum);
+ sigaction(signum, &act, NULL);
+}
+
+
+/*
+ send a packet to a client
+ */
+static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
+{
+ CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
+ if (hdr->operation == CTDB_REQ_MESSAGE) {
+ if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
+ DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
+ talloc_free(client);
+ return -1;
+ }
+ }
+ return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
+}
+
+/*
+ message handler for when we are in daemon mode. This redirects the message
+ to the right client
+ */
+static void daemon_message_handler(uint64_t srvid, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
+ struct ctdb_req_message_old *r;
+ int len;
+
+ /* construct a message to send to the client containing the data */
+ len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
+ r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE,
+ len, struct ctdb_req_message_old);
+ CTDB_NO_MEMORY_VOID(client->ctdb, r);
+
+ talloc_set_name_const(r, "req_message packet");
+
+ r->srvid = srvid;
+ r->datalen = data.dsize;
+ memcpy(&r->data[0], data.dptr, data.dsize);
+
+ daemon_queue_send(client, &r->hdr);
+
+ talloc_free(r);
+}
+
+/*
+ this is called when the ctdb daemon received a ctdb request to
+ set the srvid from the client
+ */
+int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+ struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ int res;
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
+ return -1;
+ }
+ res = srvid_register(ctdb->srv, client, srvid, daemon_message_handler,
+ client);
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
+ (unsigned long long)srvid));
+ } else {
+ DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
+ (unsigned long long)srvid));
+ }
+
+ return res;
+}
+
+/*
+ this is called when the ctdb daemon received a ctdb request to
+ remove a srvid from the client
+ */
+int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+ struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
+ return -1;
+ }
+ return srvid_deregister(ctdb->srv, srvid, client);
+}
+
+void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data,
+ void *private_data)
+{
+ struct ctdb_client *client =
+ talloc_get_type_abort(private_data, struct ctdb_client);
+ struct ctdb_req_tunnel_old *c, *pkt;
+ size_t len;
+
+ pkt = (struct ctdb_req_tunnel_old *)data.dptr;
+
+ len = offsetof(struct ctdb_req_tunnel_old, data) + pkt->datalen;
+ c = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_TUNNEL,
+ len, struct ctdb_req_tunnel_old);
+ if (c == NULL) {
+ DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n"));
+ return;
+ }
+
+ talloc_set_name_const(c, "req_tunnel packet");
+
+ c->tunnel_id = tunnel_id;
+ c->flags = pkt->flags;
+ c->datalen = pkt->datalen;
+ memcpy(c->data, pkt->data, pkt->datalen);
+
+ daemon_queue_send(client, &c->hdr);
+
+ talloc_free(c);
+}
+
+/*
+ destroy a ctdb_client
+*/
+static int ctdb_client_destructor(struct ctdb_client *client)
+{
+ struct ctdb_db_context *ctdb_db;
+
+ ctdb_takeover_client_destructor_hook(client);
+ reqid_remove(client->ctdb->idr, client->client_id);
+ client->ctdb->num_clients--;
+
+ if (client->num_persistent_updates != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
+ client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+ }
+ ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
+ if (ctdb_db) {
+ DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
+ "commit active. Forcing recovery.\n"));
+ client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+ /*
+ * trans3 transaction state:
+ *
+ * The destructor sets the pointer to NULL.
+ */
+ talloc_free(ctdb_db->persistent_state);
+ }
+
+ return 0;
+}
+
+
+/*
+ this is called when the ctdb daemon received a ctdb request message
+ from a local client over the unix domain socket
+ */
+static void daemon_request_message_from_client(struct ctdb_client *client,
+ struct ctdb_req_message_old *c)
+{
+ TDB_DATA data;
+ int res;
+
+ if (c->hdr.destnode == CTDB_CURRENT_NODE) {
+ c->hdr.destnode = ctdb_get_pnn(client->ctdb);
+ }
+
+ /* maybe the message is for another client on this node */
+ if (ctdb_get_pnn(client->ctdb)==c->hdr.destnode) {
+ ctdb_request_message(client->ctdb, (struct ctdb_req_header *)c);
+ return;
+ }
+
+ /* its for a remote node */
+ data.dptr = &c->data[0];
+ data.dsize = c->datalen;
+ res = ctdb_daemon_send_message(client->ctdb, c->hdr.destnode,
+ c->srvid, data);
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
+ c->hdr.destnode));
+ }
+}
+
+
+struct daemon_call_state {
+ struct ctdb_client *client;
+ uint32_t reqid;
+ struct ctdb_call *call;
+ struct timeval start_time;
+
+ /* readonly request ? */
+ uint32_t readonly_fetch;
+ uint32_t client_callid;
+};
+
+/*
+ complete a call from a client
+*/
+static void daemon_call_from_client_callback(struct ctdb_call_state *state)
+{
+ struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
+ struct daemon_call_state);
+ struct ctdb_reply_call_old *r;
+ int res;
+ uint32_t length;
+ struct ctdb_client *client = dstate->client;
+ struct ctdb_db_context *ctdb_db = state->ctdb_db;
+
+ talloc_steal(client, dstate);
+ talloc_steal(dstate, dstate->call);
+
+ res = ctdb_daemon_call_recv(state, dstate->call);
+ if (res != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
+ CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+
+ CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
+ return;
+ }
+
+ length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize;
+ /* If the client asked for readonly FETCH, we remapped this to
+ FETCH_WITH_HEADER when calling the daemon. So we must
+ strip the extra header off the reply data before passing
+ it back to the client.
+ */
+ if (dstate->readonly_fetch
+ && dstate->client_callid == CTDB_FETCH_FUNC) {
+ length -= sizeof(struct ctdb_ltdb_header);
+ }
+
+ r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
+ length, struct ctdb_reply_call_old);
+ if (r == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
+ CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+ CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
+ return;
+ }
+ r->hdr.reqid = dstate->reqid;
+ r->status = dstate->call->status;
+
+ if (dstate->readonly_fetch
+ && dstate->client_callid == CTDB_FETCH_FUNC) {
+ /* client only asked for a FETCH so we must strip off
+ the extra ctdb_ltdb header
+ */
+ r->datalen = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
+ memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
+ } else {
+ r->datalen = dstate->call->reply_data.dsize;
+ memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
+ }
+
+ res = daemon_queue_send(client, &r->hdr);
+ if (res == -1) {
+ /* client is dead - return immediately */
+ return;
+ }
+ if (res != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
+ }
+ CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
+ CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+ talloc_free(dstate);
+}
+
+struct ctdb_daemon_packet_wrap {
+ struct ctdb_context *ctdb;
+ uint32_t client_id;
+};
+
+/*
+ a wrapper to catch disconnected clients
+ */
+static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
+{
+ struct ctdb_client *client;
+ struct ctdb_daemon_packet_wrap *w = talloc_get_type(p,
+ struct ctdb_daemon_packet_wrap);
+ if (w == NULL) {
+ DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
+ return;
+ }
+
+ client = reqid_find(w->ctdb->idr, w->client_id, struct ctdb_client);
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+ w->client_id));
+ talloc_free(w);
+ return;
+ }
+ talloc_free(w);
+
+ /* process it */
+ daemon_incoming_packet(client, hdr);
+}
+
+struct ctdb_deferred_fetch_call {
+ struct ctdb_deferred_fetch_call *next, *prev;
+ struct ctdb_req_call_old *c;
+ struct ctdb_daemon_packet_wrap *w;
+};
+
+struct ctdb_deferred_fetch_queue {
+ struct ctdb_deferred_fetch_call *deferred_calls;
+};
+
+struct ctdb_deferred_requeue {
+ struct ctdb_deferred_fetch_call *dfc;
+ struct ctdb_client *client;
+};
+
+/* called from a timer event and starts reprocessing the deferred call.*/
+static void reprocess_deferred_call(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
+ struct ctdb_client *client = dfr->client;
+
+ talloc_steal(client, dfr->dfc->c);
+ daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
+ talloc_free(dfr);
+}
+
+/* the referral context is destroyed either after a timeout or when the initial
+ fetch-lock has finished.
+ at this stage, immediately start reprocessing the queued up deferred
+ calls so they get reprocessed immediately (and since we are dmaster at
+ this stage, trigger the waiting smbd processes to pick up and acquire the
+ record right away.
+*/
+static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
+{
+
+ /* need to reprocess the packets from the queue explicitly instead of
+ just using a normal destructor since we need to
+ call the clients in the same order as the requests queued up
+ */
+ while (dfq->deferred_calls != NULL) {
+ struct ctdb_client *client;
+ struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
+ struct ctdb_deferred_requeue *dfr;
+
+ DLIST_REMOVE(dfq->deferred_calls, dfc);
+
+ client = reqid_find(dfc->w->ctdb->idr, dfc->w->client_id, struct ctdb_client);
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+ dfc->w->client_id));
+ continue;
+ }
+
+ /* process it by pushing it back onto the eventloop */
+ dfr = talloc(client, struct ctdb_deferred_requeue);
+ if (dfr == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
+ continue;
+ }
+
+ dfr->dfc = talloc_steal(dfr, dfc);
+ dfr->client = client;
+
+ tevent_add_timer(dfc->w->ctdb->ev, client, timeval_zero(),
+ reprocess_deferred_call, dfr);
+ }
+
+ return 0;
+}
+
+/* insert the new deferral context into the rb tree.
+ there should never be a pre-existing context here, but check for it
+ warn and destroy the previous context if there is already a deferral context
+ for this key.
+*/
+static void *insert_dfq_callback(void *parm, void *data)
+{
+ if (data) {
+ DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
+ talloc_free(data);
+ }
+ return parm;
+}
+
+/* if the original fetch-lock did not complete within a reasonable time,
+ free the context and context for all deferred requests to cause them to be
+ re-inserted into the event system.
+*/
+static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ talloc_free(private_data);
+}
+
+/* This function is used in the local daemon to register a KEY in a database
+ for being "fetched"
+ While the remote fetch is in-flight, any further attempts to re-fetch the
+ same record will be deferred until the fetch completes.
+*/
+static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
+{
+ uint32_t *k;
+ struct ctdb_deferred_fetch_queue *dfq;
+
+ k = ctdb_key_to_idkey(call, call->key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+ return -1;
+ }
+
+ dfq = talloc(call, struct ctdb_deferred_fetch_queue);
+ if (dfq == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
+ talloc_free(k);
+ return -1;
+ }
+ dfq->deferred_calls = NULL;
+
+ trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);
+
+ talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
+
+ /* If the fetch hasn't completed in 30 seconds, just tear it all down
+ and let it try again as the events are reissued */
+ tevent_add_timer(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0),
+ dfq_timeout, dfq);
+
+ talloc_free(k);
+ return 0;
+}
+
+/* check if this is a duplicate request to a fetch already in-flight
+ if it is, make this call deferred to be reprocessed later when
+ the in-flight fetch completes.
+*/
+static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c)
+{
+ uint32_t *k;
+ struct ctdb_deferred_fetch_queue *dfq;
+ struct ctdb_deferred_fetch_call *dfc;
+
+ k = ctdb_key_to_idkey(c, key);
+ if (k == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+ return -1;
+ }
+
+ dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
+ if (dfq == NULL) {
+ talloc_free(k);
+ return -1;
+ }
+
+
+ talloc_free(k);
+
+ dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
+ if (dfc == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
+ return -1;
+ }
+
+ dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
+ if (dfc->w == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
+ talloc_free(dfc);
+ return -1;
+ }
+
+ dfc->c = talloc_steal(dfc, c);
+ dfc->w->ctdb = ctdb_db->ctdb;
+ dfc->w->client_id = client->client_id;
+
+ DLIST_ADD_END(dfq->deferred_calls, dfc);
+
+ return 0;
+}
+
+
+/*
+ this is called when the ctdb daemon received a ctdb request call
+ from a local client over the unix domain socket
+ */
+static void daemon_request_call_from_client(struct ctdb_client *client,
+ struct ctdb_req_call_old *c)
+{
+ struct ctdb_call_state *state;
+ struct ctdb_db_context *ctdb_db;
+ struct daemon_call_state *dstate;
+ struct ctdb_call *call;
+ struct ctdb_ltdb_header header;
+ TDB_DATA key, data;
+ int ret;
+ struct ctdb_context *ctdb = client->ctdb;
+ struct ctdb_daemon_packet_wrap *w;
+
+ CTDB_INCREMENT_STAT(ctdb, total_calls);
+ CTDB_INCREMENT_STAT(ctdb, pending_calls);
+
+ ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
+ if (!ctdb_db) {
+ DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x\n",
+ c->db_id));
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ return;
+ }
+
+ if (ctdb_db->unhealthy_reason) {
+ /*
+ * this is just a warning, as the tdb should be empty anyway,
+ * and only persistent databases can be unhealthy, which doesn't
+ * use this code patch
+ */
+ DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
+ ctdb_db->db_name, ctdb_db->unhealthy_reason));
+ }
+
+ key.dptr = c->data;
+ key.dsize = c->keylen;
+
+ w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
+ CTDB_NO_MEMORY_VOID(ctdb, w);
+
+ w->ctdb = ctdb;
+ w->client_id = client->client_id;
+
+ ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
+ (struct ctdb_req_header *)c, &data,
+ daemon_incoming_packet_wrap, w, true);
+ if (ret == -2) {
+ /* will retry later */
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ return;
+ }
+
+ talloc_free(w);
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ return;
+ }
+
+
+ /* check if this fetch request is a duplicate for a
+ request we already have in flight. If so defer it until
+ the first request completes.
+ */
+ if (ctdb->tunable.fetch_collapse == 1) {
+ if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+ }
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ talloc_free(data.dptr);
+ return;
+ }
+ }
+
+ /* Dont do READONLY if we don't have a tracking database */
+ if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
+ c->flags &= ~CTDB_WANT_READONLY;
+ }
+
+ if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+ header.flags &= ~CTDB_REC_RO_FLAGS;
+ CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+ CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+ if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+ ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+ }
+ /* and clear out the tracking data */
+ if (tdb_delete(ctdb_db->rottdb, key) != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+ }
+ }
+
+ /* if we are revoking, we must defer all other calls until the revoke
+ * had completed.
+ */
+ if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+ talloc_free(data.dptr);
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+ if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+ ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+ }
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ return;
+ }
+
+ if ((header.dmaster == ctdb->pnn)
+ && (!(c->flags & CTDB_WANT_READONLY))
+ && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+ header.flags |= CTDB_REC_RO_REVOKING_READONLY;
+ if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+ ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+ }
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+ if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
+ ctdb_fatal(ctdb, "Failed to start record revoke");
+ }
+ talloc_free(data.dptr);
+
+ if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+ ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+ }
+
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ return;
+ }
+
+ dstate = talloc(client, struct daemon_call_state);
+ if (dstate == NULL) {
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+ }
+
+ DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ return;
+ }
+ dstate->start_time = timeval_current();
+ dstate->client = client;
+ dstate->reqid = c->hdr.reqid;
+ talloc_steal(dstate, data.dptr);
+
+ call = dstate->call = talloc_zero(dstate, struct ctdb_call);
+ if (call == NULL) {
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+ }
+
+ DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
+ return;
+ }
+
+ dstate->readonly_fetch = 0;
+ call->call_id = c->callid;
+ call->key = key;
+ call->call_data.dptr = c->data + c->keylen;
+ call->call_data.dsize = c->calldatalen;
+ call->flags = c->flags;
+
+ if (c->flags & CTDB_WANT_READONLY) {
+ /* client wants readonly record, so translate this into a
+ fetch with header. remember what the client asked for
+ so we can remap the reply back to the proper format for
+ the client in the reply
+ */
+ dstate->client_callid = call->call_id;
+ call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
+ dstate->readonly_fetch = 1;
+ }
+
+ if (header.dmaster == ctdb->pnn) {
+ state = ctdb_call_local_send(ctdb_db, call, &header, &data);
+ } else {
+ state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
+ if (ctdb->tunable.fetch_collapse == 1) {
+ /* This request triggered a remote fetch-lock.
+ set up a deferral for this key so any additional
+ fetch-locks are deferred until the current one
+ finishes.
+ */
+ setup_deferred_fetch_locks(ctdb_db, call);
+ }
+ }
+
+ ret = ctdb_ltdb_unlock(ctdb_db, key);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+ }
+
+ if (state == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
+ CTDB_DECREMENT_STAT(ctdb, pending_calls);
+ CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
+ return;
+ }
+ talloc_steal(state, dstate);
+ talloc_steal(client, state);
+
+ state->async.fn = daemon_call_from_client_callback;
+ state->async.private_data = dstate;
+}
+
+
+static void daemon_request_control_from_client(struct ctdb_client *client,
+ struct ctdb_req_control_old *c);
+static void daemon_request_tunnel_from_client(struct ctdb_client *client,
+ struct ctdb_req_tunnel_old *c);
+
+/* data contains a packet from the client */
+static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
+{
+ struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
+ TALLOC_CTX *tmp_ctx;
+ struct ctdb_context *ctdb = client->ctdb;
+
+ /* place the packet as a child of a tmp_ctx. We then use
+ talloc_free() below to free it. If any of the calls want
+ to keep it, then they will steal it somewhere else, and the
+ talloc_free() will be a no-op */
+ tmp_ctx = talloc_new(client);
+ talloc_steal(tmp_ctx, hdr);
+
+ if (hdr->ctdb_magic != CTDB_MAGIC) {
+ ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
+ goto done;
+ }
+
+ if (hdr->ctdb_version != CTDB_PROTOCOL) {
+ ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+ goto done;
+ }
+
+ switch (hdr->operation) {
+ case CTDB_REQ_CALL:
+ CTDB_INCREMENT_STAT(ctdb, client.req_call);
+ daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr);
+ break;
+
+ case CTDB_REQ_MESSAGE:
+ CTDB_INCREMENT_STAT(ctdb, client.req_message);
+ daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr);
+ break;
+
+ case CTDB_REQ_CONTROL:
+ CTDB_INCREMENT_STAT(ctdb, client.req_control);
+ daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr);
+ break;
+
+ case CTDB_REQ_TUNNEL:
+ CTDB_INCREMENT_STAT(ctdb, client.req_tunnel);
+ daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr);
+ break;
+
+ default:
+ DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
+ hdr->operation));
+ }
+
+done:
+ talloc_free(tmp_ctx);
+}
+
+/*
+ called when the daemon gets a incoming packet
+ */
+static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+ struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
+ struct ctdb_req_header *hdr;
+
+ if (cnt == 0) {
+ talloc_free(client);
+ return;
+ }
+
+ CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
+
+ if (cnt < sizeof(*hdr)) {
+ ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
+ (unsigned)cnt);
+ return;
+ }
+ hdr = (struct ctdb_req_header *)data;
+
+ if (hdr->ctdb_magic != CTDB_MAGIC) {
+ ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
+ goto err_out;
+ }
+
+ if (hdr->ctdb_version != CTDB_PROTOCOL) {
+ ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+ goto err_out;
+ }
+
+ DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
+ "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
+ hdr->srcnode, hdr->destnode));
+
+ /* it is the responsibility of the incoming packet function to free 'data' */
+ daemon_incoming_packet(client, hdr);
+ return;
+
+err_out:
+ TALLOC_FREE(data);
+}
+
+
+static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
+{
+ if (client_pid->ctdb->client_pids != NULL) {
+ DLIST_REMOVE(client_pid->ctdb->client_pids, client_pid);
+ }
+
+ return 0;
+}
+
+static int get_new_client_id(struct reqid_context *idr,
+ struct ctdb_client *client,
+ uint32_t *out)
+{
+ uint32_t client_id;
+
+ client_id = reqid_new(idr, client);
+ /*
+ * Some places in the code (e.g. ctdb_control_db_attach(),
+ * ctdb_control_db_detach()) assign a special meaning to
+ * client_id 0. The assumption is that if client_id is 0 then
+ * the control has come from another daemon. Therefore, we
+ * should never return client_id == 0.
+ */
+ if (client_id == 0) {
+ /*
+ * Don't leak ID 0. This is safe because the ID keeps
+ * increasing. A test will be added to ensure that
+ * this doesn't change.
+ */
+ reqid_remove(idr, 0);
+
+ client_id = reqid_new(idr, client);
+ }
+
+ if (client_id == REQID_INVALID) {
+ return EINVAL;
+ }
+
+ if (client_id == 0) {
+ /* Every other ID must have been used and we can't use 0 */
+ reqid_remove(idr, 0);
+ return EINVAL;
+ }
+
+ *out = client_id;
+ return 0;
+}
+
+static void ctdb_accept_client(struct tevent_context *ev,
+ struct tevent_fd *fde, uint16_t flags,
+ void *private_data)
+{
+ struct sockaddr_un addr;
+ socklen_t len;
+ int fd;
+ struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+ struct ctdb_client *client;
+ struct ctdb_client_pid_list *client_pid;
+ pid_t peer_pid = 0;
+ int ret;
+
+ memset(&addr, 0, sizeof(addr));
+ len = sizeof(addr);
+ fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
+ if (fd == -1) {
+ return;
+ }
+ smb_set_close_on_exec(fd);
+
+ ret = set_blocking(fd, false);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ (__location__
+ " failed to set socket non-blocking (%s)\n",
+ strerror(errno)));
+ close(fd);
+ return;
+ }
+
+ set_close_on_exec(fd);
+
+ DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
+
+ client = talloc_zero(ctdb, struct ctdb_client);
+ if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
+ DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
+ }
+
+ client->ctdb = ctdb;
+ client->fd = fd;
+
+ ret = get_new_client_id(ctdb->idr, client, &client->client_id);
+ if (ret != 0) {
+ DBG_ERR("Unable to get client ID (%d)\n", ret);
+ close(fd);
+ talloc_free(client);
+ return;
+ }
+
+ client->pid = peer_pid;
+
+ client_pid = talloc(client, struct ctdb_client_pid_list);
+ if (client_pid == NULL) {
+ DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
+ close(fd);
+ talloc_free(client);
+ return;
+ }
+ client_pid->ctdb = ctdb;
+ client_pid->pid = peer_pid;
+ client_pid->client = client;
+
+ DLIST_ADD(ctdb->client_pids, client_pid);
+
+ client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
+ ctdb_daemon_read_cb, client,
+ "client-%u", client->pid);
+
+ talloc_set_destructor(client, ctdb_client_destructor);
+ talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
+ ctdb->num_clients++;
+}
+
+
+
+/*
+ * Create a unix domain socket, bind it, secure it and listen. Return
+ * the file descriptor for the socket.
+ */
+static int ux_socket_bind(struct ctdb_context *ctdb, bool test_mode_enabled)
+{
+ struct sockaddr_un addr = { .sun_family = AF_UNIX };
+ int ret;
+
+ ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (ctdb->daemon.sd == -1) {
+ return -1;
+ }
+
+ strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
+
+ if (! sock_clean(ctdb->daemon.name)) {
+ return -1;
+ }
+
+ set_close_on_exec(ctdb->daemon.sd);
+
+ ret = set_blocking(ctdb->daemon.sd, false);
+ if (ret != 0) {
+ DBG_ERR("Failed to set socket non-blocking (%s)\n",
+ strerror(errno));
+ goto failed;
+ }
+
+ ret = bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr));
+ if (ret == -1) {
+ D_ERR("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name);
+ goto failed;
+ }
+
+ if (!test_mode_enabled) {
+ ret = chown(ctdb->daemon.name, geteuid(), getegid());
+ if (ret != 0 && !test_mode_enabled) {
+ D_ERR("Unable to secure (chown) ctdb socket '%s'\n",
+ ctdb->daemon.name);
+ goto failed;
+ }
+ }
+
+ ret = chmod(ctdb->daemon.name, 0700);
+ if (ret != 0) {
+ D_ERR("Unable to secure (chmod) ctdb socket '%s'\n",
+ ctdb->daemon.name);
+ goto failed;
+ }
+
+
+ ret = listen(ctdb->daemon.sd, 100);
+ if (ret != 0) {
+ D_ERR("Unable to listen on ctdb socket '%s'\n",
+ ctdb->daemon.name);
+ goto failed;
+ }
+
+ D_NOTICE("Listening to ctdb socket %s\n", ctdb->daemon.name);
+ return 0;
+
+failed:
+ close(ctdb->daemon.sd);
+ ctdb->daemon.sd = -1;
+ return -1;
+}
+
+struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn)
+{
+ struct ctdb_node *node = NULL;
+ unsigned int i;
+
+ if (pnn == CTDB_CURRENT_NODE) {
+ pnn = ctdb->pnn;
+ }
+
+ /* Always found: PNN correctly set just before this is called */
+ for (i = 0; i < ctdb->num_nodes; i++) {
+ node = ctdb->nodes[i];
+ if (pnn == node->pnn) {
+ return node;
+ }
+ }
+
+ return NULL;
+}
+
+static void initialise_node_flags (struct ctdb_context *ctdb)
+{
+ struct ctdb_node *node = NULL;
+
+ node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
+ /*
+ * PNN correctly set just before this is called so always
+ * found but keep static analysers happy...
+ */
+ if (node == NULL) {
+ DBG_ERR("Unable to find current node\n");
+ return;
+ }
+
+ node->flags &= ~NODE_FLAGS_DISCONNECTED;
+
+ /* do we start out in DISABLED mode? */
+ if (ctdb->start_as_disabled != 0) {
+ D_ERR("This node is configured to start in DISABLED state\n");
+ node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
+ }
+ /* do we start out in STOPPED mode? */
+ if (ctdb->start_as_stopped != 0) {
+ D_ERR("This node is configured to start in STOPPED state\n");
+ node->flags |= NODE_FLAGS_STOPPED;
+ }
+}
+
+static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ if (status != 0) {
+ ctdb_die(ctdb, "Failed to run setup event");
+ }
+ ctdb_run_notification_script(ctdb, "setup");
+
+ /* Start the recovery daemon */
+ if (ctdb_start_recoverd(ctdb) != 0) {
+ DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
+ exit(11);
+ }
+
+ ctdb_start_periodic_events(ctdb);
+
+ ctdb_wait_for_first_recovery(ctdb);
+}
+
+static struct timeval tevent_before_wait_ts;
+static struct timeval tevent_after_wait_ts;
+
+static void ctdb_tevent_trace_init(void)
+{
+ struct timeval now;
+
+ now = timeval_current();
+
+ tevent_before_wait_ts = now;
+ tevent_after_wait_ts = now;
+}
+
+static void ctdb_tevent_trace(enum tevent_trace_point tp,
+ void *private_data)
+{
+ struct timeval diff;
+ struct timeval now;
+ struct ctdb_context *ctdb =
+ talloc_get_type(private_data, struct ctdb_context);
+
+ if (getpid() != ctdb->ctdbd_pid) {
+ return;
+ }
+
+ now = timeval_current();
+
+ switch (tp) {
+ case TEVENT_TRACE_BEFORE_WAIT:
+ diff = timeval_until(&tevent_after_wait_ts, &now);
+ if (diff.tv_sec > 3) {
+ DEBUG(DEBUG_ERR,
+ ("Handling event took %ld seconds!\n",
+ (long)diff.tv_sec));
+ }
+ tevent_before_wait_ts = now;
+ break;
+
+ case TEVENT_TRACE_AFTER_WAIT:
+ diff = timeval_until(&tevent_before_wait_ts, &now);
+ if (diff.tv_sec > 3) {
+ DEBUG(DEBUG_ERR,
+ ("No event for %ld seconds!\n",
+ (long)diff.tv_sec));
+ }
+ tevent_after_wait_ts = now;
+ break;
+
+ default:
+ /* Do nothing for future tevent trace points */ ;
+ }
+}
+
+static void ctdb_remove_pidfile(void)
+{
+ TALLOC_FREE(ctdbd_pidfile_ctx);
+}
+
+static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx)
+{
+ if (ctdbd_pidfile != NULL) {
+ int ret = pidfile_context_create(mem_ctx, ctdbd_pidfile,
+ &ctdbd_pidfile_ctx);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ ("Failed to create PID file %s\n",
+ ctdbd_pidfile));
+ exit(11);
+ }
+
+ DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
+ atexit(ctdb_remove_pidfile);
+ }
+}
+
+static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb)
+{
+ unsigned int i, j, count;
+
+ /* initialize the vnn mapping table, skipping any deleted nodes */
+ ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
+ CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map);
+
+ count = 0;
+ for (i = 0; i < ctdb->num_nodes; i++) {
+ if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) {
+ count++;
+ }
+ }
+
+ ctdb->vnn_map->generation = INVALID_GENERATION;
+ ctdb->vnn_map->size = count;
+ ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
+ CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map);
+
+ for(i=0, j=0; i < ctdb->vnn_map->size; i++) {
+ if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+ continue;
+ }
+ ctdb->vnn_map->map[j] = i;
+ j++;
+ }
+}
+
+static void ctdb_set_my_pnn(struct ctdb_context *ctdb)
+{
+ if (ctdb->address == NULL) {
+ ctdb_fatal(ctdb,
+ "Can not determine PNN - node address is not set\n");
+ }
+
+ ctdb->pnn = ctdb_ip_to_pnn(ctdb, ctdb->address);
+ if (ctdb->pnn == CTDB_UNKNOWN_PNN) {
+ ctdb_fatal(ctdb,
+ "Can not determine PNN - unknown node address\n");
+ }
+
+ D_NOTICE("PNN is %u\n", ctdb->pnn);
+}
+
+static void stdin_handler(struct tevent_context *ev,
+ struct tevent_fd *fde,
+ uint16_t flags,
+ void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type_abort(
+ private_data, struct ctdb_context);
+ ssize_t nread;
+ char c;
+
+ nread = read(STDIN_FILENO, &c, 1);
+ if (nread != 1) {
+ D_ERR("stdin closed, exiting\n");
+ talloc_free(fde);
+ ctdb_shutdown_sequence(ctdb, EPIPE);
+ }
+}
+
+static int setup_stdin_handler(struct ctdb_context *ctdb)
+{
+ struct tevent_fd *fde;
+ struct stat st;
+ int ret;
+
+ ret = fstat(STDIN_FILENO, &st);
+ if (ret != 0) {
+ /* Problem with stdin, ignore... */
+ DBG_INFO("Can't fstat() stdin\n");
+ return 0;
+ }
+
+ if (!S_ISFIFO(st.st_mode)) {
+ DBG_INFO("Not a pipe...\n");
+ return 0;
+ }
+
+ fde = tevent_add_fd(ctdb->ev,
+ ctdb,
+ STDIN_FILENO,
+ TEVENT_FD_READ,
+ stdin_handler,
+ ctdb);
+ if (fde == NULL) {
+ return ENOMEM;
+ }
+
+ DBG_INFO("Set up stdin handler\n");
+ return 0;
+}
+
+static void fork_only(void)
+{
+ pid_t pid;
+
+ pid = fork();
+ if (pid == -1) {
+ D_ERR("Fork failed (errno=%d)\n", errno);
+ exit(1);
+ }
+
+ if (pid != 0) {
+ /* Parent simply exits... */
+ exit(0);
+ }
+}
+
+static void sighup_hook(void *private_data)
+{
+ struct ctdb_context *ctdb = talloc_get_type_abort(private_data,
+ struct ctdb_context);
+
+ if (ctdb->recoverd_pid > 0) {
+ kill(ctdb->recoverd_pid, SIGHUP);
+ }
+ ctdb_event_reopen_logs(ctdb);
+}
+
+/*
+ start the protocol going as a daemon
+*/
+int ctdb_start_daemon(struct ctdb_context *ctdb,
+ bool interactive,
+ bool test_mode_enabled)
+{
+ bool status;
+ int ret;
+ struct tevent_fd *fde;
+
+ /* Fork if not interactive */
+ if (!interactive) {
+ if (test_mode_enabled) {
+ /* Keep stdin open */
+ fork_only();
+ } else {
+ /* Fork, close stdin, start a session */
+ become_daemon(true, false, false);
+ }
+ }
+
+ ignore_signal(SIGPIPE);
+ ignore_signal(SIGUSR1);
+
+ ctdb->ctdbd_pid = getpid();
+ DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
+ SAMBA_VERSION_STRING, ctdb->ctdbd_pid));
+ ctdb_create_pidfile(ctdb);
+
+ /* create a unix domain stream socket to listen to */
+ ret = ux_socket_bind(ctdb, test_mode_enabled);
+ if (ret != 0) {
+ D_ERR("Cannot continue. Exiting!\n");
+ exit(10);
+ }
+
+ /* Make sure we log something when the daemon terminates.
+ * This must be the first exit handler to run (so the last to
+ * be registered.
+ */
+ __ctdbd_pid = getpid();
+ atexit(print_exit_message);
+
+ if (ctdb->do_setsched) {
+ /* try to set us up as realtime */
+ if (!set_scheduler()) {
+ exit(1);
+ }
+ DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n"));
+ }
+
+ ctdb->ev = tevent_context_init(NULL);
+ if (ctdb->ev == NULL) {
+ DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
+ exit(1);
+ }
+ tevent_loop_allow_nesting(ctdb->ev);
+ ctdb_tevent_trace_init();
+ tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);
+
+ status = logging_setup_sighup_handler(ctdb->ev,
+ ctdb,
+ sighup_hook,
+ ctdb);
+ if (!status) {
+ D_ERR("Failed to set up signal handler for SIGHUP\n");
+ exit(1);
+ }
+
+ /* set up a handler to pick up sigchld */
+ if (ctdb_init_sigchld(ctdb) == NULL) {
+ DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
+ exit(1);
+ }
+
+ if (!interactive) {
+ ctdb_set_child_logging(ctdb);
+ }
+
+ /* Exit if stdin is closed */
+ if (test_mode_enabled) {
+ ret = setup_stdin_handler(ctdb);
+ if (ret != 0) {
+ DBG_ERR("Failed to setup stdin handler\n");
+ exit(1);
+ }
+ }
+
+ TALLOC_FREE(ctdb->srv);
+ if (srvid_init(ctdb, &ctdb->srv) != 0) {
+ DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n"));
+ exit(1);
+ }
+
+ TALLOC_FREE(ctdb->tunnels);
+ if (srvid_init(ctdb, &ctdb->tunnels) != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n"));
+ exit(1);
+ }
+
+ /* initialize statistics collection */
+ ctdb_statistics_init(ctdb);
+
+ /* force initial recovery for election */
+ ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+ if (ctdb_start_eventd(ctdb) != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to start event daemon\n"));
+ exit(1);
+ }
+
+ ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
+ ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
+ if (ret != 0) {
+ ctdb_die(ctdb, "Failed to run init event\n");
+ }
+ ctdb_run_notification_script(ctdb, "init");
+
+ if (strcmp(ctdb->transport, "tcp") == 0) {
+ ret = ctdb_tcp_init(ctdb);
+ }
+#ifdef USE_INFINIBAND
+ if (strcmp(ctdb->transport, "ib") == 0) {
+ ret = ctdb_ibw_init(ctdb);
+ }
+#endif
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
+ return -1;
+ }
+
+ if (ctdb->methods == NULL) {
+ DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
+ ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
+ }
+
+ /* Initialise the transport. This sets the node address if it
+ * was not set via the command-line. */
+ if (ctdb->methods->initialise(ctdb) != 0) {
+ ctdb_fatal(ctdb, "transport failed to initialise");
+ }
+
+ ctdb_set_my_pnn(ctdb);
+
+ initialise_node_flags(ctdb);
+
+ ret = ctdb_set_public_addresses(ctdb, true);
+ if (ret == -1) {
+ D_ERR("Unable to setup public IP addresses\n");
+ exit(1);
+ }
+
+ ctdb_initialise_vnn_map(ctdb);
+
+ /* attach to existing databases */
+ if (ctdb_attach_databases(ctdb) != 0) {
+ ctdb_fatal(ctdb, "Failed to attach to databases\n");
+ }
+
+ /* start frozen, then let the first election sort things out */
+ if (!ctdb_blocking_freeze(ctdb)) {
+ ctdb_fatal(ctdb, "Failed to get initial freeze\n");
+ }
+
+ /* now start accepting clients, only can do this once frozen */
+ fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ,
+ ctdb_accept_client, ctdb);
+ if (fde == NULL) {
+ ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
+ }
+ tevent_fd_set_auto_close(fde);
+
+ /* Start the transport */
+ if (ctdb->methods->start(ctdb) != 0) {
+ DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
+ ctdb_fatal(ctdb, "transport failed to start");
+ }
+
+ /* Recovery daemon and timed events are started from the
+ * callback, only after the setup event completes
+ * successfully.
+ */
+ ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
+ ret = ctdb_event_script_callback(ctdb,
+ ctdb,
+ ctdb_setup_event_callback,
+ ctdb,
+ CTDB_EVENT_SETUP,
+ "%s",
+ "");
+ if (ret != 0) {
+ DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
+ exit(1);
+ }
+
+ lockdown_memory(ctdb->valgrinding);
+
+ /* go into a wait loop to allow other nodes to complete */
+ tevent_loop_wait(ctdb->ev);
+
+ DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
+ exit(1);
+}
+
+/*
+ allocate a packet for use in daemon<->daemon communication
+ */
+struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
+ TALLOC_CTX *mem_ctx,
+ enum ctdb_operation operation,
+ size_t length, size_t slength,
+ const char *type)
+{
+ int size;
+ struct ctdb_req_header *hdr;
+
+ length = MAX(length, slength);
+ size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
+
+ if (ctdb->methods == NULL) {
+ DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
+ operation, (unsigned)length));
+ return NULL;
+ }
+
+ hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size);
+ if (hdr == NULL) {
+ DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
+ operation, (unsigned)length));
+ return NULL;
+ }
+ talloc_set_name_const(hdr, type);
+ memset(hdr, 0, slength);
+ hdr->length = length;
+ hdr->operation = operation;
+ hdr->ctdb_magic = CTDB_MAGIC;
+ hdr->ctdb_version = CTDB_PROTOCOL;
+ hdr->generation = ctdb->vnn_map->generation;
+ hdr->srcnode = ctdb->pnn;
+
+ return hdr;
+}
+
+struct daemon_control_state {
+ struct daemon_control_state *next, *prev;
+ struct ctdb_client *client;
+ struct ctdb_req_control_old *c;
+ uint32_t reqid;
+ struct ctdb_node *node;
+};
+
+/*
+ callback when a control reply comes in
+ */
+static void daemon_control_callback(struct ctdb_context *ctdb,
+ int32_t status, TDB_DATA data,
+ const char *errormsg,
+ void *private_data)
+{
+ struct daemon_control_state *state = talloc_get_type(private_data,
+ struct daemon_control_state);
+ struct ctdb_client *client = state->client;
+ struct ctdb_reply_control_old *r;
+ size_t len;
+ int ret;
+
+ /* construct a message to send to the client containing the data */
+ len = offsetof(struct ctdb_reply_control_old, data) + data.dsize;
+ if (errormsg) {
+ len += strlen(errormsg);
+ }
+ r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
+ struct ctdb_reply_control_old);
+ CTDB_NO_MEMORY_VOID(ctdb, r);
+
+ r->hdr.reqid = state->reqid;
+ r->status = status;
+ r->datalen = data.dsize;
+ r->errorlen = 0;
+ memcpy(&r->data[0], data.dptr, data.dsize);
+ if (errormsg) {
+ r->errorlen = strlen(errormsg);
+ memcpy(&r->data[r->datalen], errormsg, r->errorlen);
+ }
+
+ ret = daemon_queue_send(client, &r->hdr);
+ if (ret != -1) {
+ talloc_free(state);
+ }
+}
+
+/*
+ fail all pending controls to a disconnected node
+ */
+void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
+{
+ struct daemon_control_state *state;
+ while ((state = node->pending_controls)) {
+ DLIST_REMOVE(node->pending_controls, state);
+ daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
+ "node is disconnected", state);
+ }
+}
+
+/*
+ destroy a daemon_control_state
+ */
+static int daemon_control_destructor(struct daemon_control_state *state)
+{
+ if (state->node) {
+ DLIST_REMOVE(state->node->pending_controls, state);
+ }
+ return 0;
+}
+
+/*
+ this is called when the ctdb daemon received a ctdb request control
+ from a local client over the unix domain socket
+ */
+static void daemon_request_control_from_client(struct ctdb_client *client,
+ struct ctdb_req_control_old *c)
+{
+ TDB_DATA data;
+ int res;
+ struct daemon_control_state *state;
+ TALLOC_CTX *tmp_ctx = talloc_new(client);
+
+ if (c->hdr.destnode == CTDB_CURRENT_NODE) {
+ c->hdr.destnode = client->ctdb->pnn;
+ }
+
+ state = talloc(client, struct daemon_control_state);
+ CTDB_NO_MEMORY_VOID(client->ctdb, state);
+
+ state->client = client;
+ state->c = talloc_steal(state, c);
+ state->reqid = c->hdr.reqid;
+ if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
+ state->node = client->ctdb->nodes[c->hdr.destnode];
+ DLIST_ADD(state->node->pending_controls, state);
+ } else {
+ state->node = NULL;
+ }
+
+ talloc_set_destructor(state, daemon_control_destructor);
+
+ if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
+ talloc_steal(tmp_ctx, state);
+ }
+
+ data.dptr = &c->data[0];
+ data.dsize = c->datalen;
+ res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
+ c->srvid, c->opcode, client->client_id,
+ c->flags,
+ data, daemon_control_callback,
+ state);
+ if (res != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
+ c->hdr.destnode));
+ }
+
+ talloc_free(tmp_ctx);
+}
+
+static void daemon_request_tunnel_from_client(struct ctdb_client *client,
+ struct ctdb_req_tunnel_old *c)
+{
+ TDB_DATA data;
+ int ret;
+
+ if (! ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
+ DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n",
+ c->hdr.destnode));
+ return;
+ }
+
+ ret = srvid_exists(client->ctdb->tunnels, c->tunnel_id, NULL);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,
+ ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n",
+ c->tunnel_id));
+ return;
+ }
+
+ data = (TDB_DATA) {
+ .dsize = c->datalen,
+ .dptr = &c->data[0],
+ };
+
+ ret = ctdb_daemon_send_tunnel(client->ctdb, c->hdr.destnode,
+ c->tunnel_id, c->flags, data);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Failed to set tunnel to remote note %u\n",
+ c->hdr.destnode));
+ }
+}
+
+/*
+ register a call function
+*/
+int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
+ ctdb_fn_t fn, int id)
+{
+ struct ctdb_registered_call *call;
+ struct ctdb_db_context *ctdb_db;
+
+ ctdb_db = find_ctdb_db(ctdb, db_id);
+ if (ctdb_db == NULL) {
+ return -1;
+ }
+
+ call = talloc(ctdb_db, struct ctdb_registered_call);
+ call->fn = fn;
+ call->id = id;
+
+ DLIST_ADD(ctdb_db->calls, call);
+ return 0;
+}
+
+
+
+/*
+ this local messaging handler is ugly, but is needed to prevent
+ recursion in ctdb_send_message() when the destination node is the
+ same as the source node
+ */
+struct ctdb_local_message {
+ struct ctdb_context *ctdb;
+ uint64_t srvid;
+ TDB_DATA data;
+};
+
+static void ctdb_local_message_trigger(struct tevent_context *ev,
+ struct tevent_timer *te,
+ struct timeval t, void *private_data)
+{
+ struct ctdb_local_message *m = talloc_get_type(
+ private_data, struct ctdb_local_message);
+
+ srvid_dispatch(m->ctdb->srv, m->srvid, CTDB_SRVID_ALL, m->data);
+ talloc_free(m);
+}
+
+static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
+{
+ struct ctdb_local_message *m;
+ m = talloc(ctdb, struct ctdb_local_message);
+ CTDB_NO_MEMORY(ctdb, m);
+
+ m->ctdb = ctdb;
+ m->srvid = srvid;
+ m->data = data;
+ m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
+ if (m->data.dptr == NULL) {
+ talloc_free(m);
+ return -1;
+ }
+
+ /* this needs to be done as an event to prevent recursion */
+ tevent_add_timer(ctdb->ev, m, timeval_zero(),
+ ctdb_local_message_trigger, m);
+ return 0;
+}
+
+/*
+ send a ctdb message
+*/
+int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+ uint64_t srvid, TDB_DATA data)
+{
+ struct ctdb_req_message_old *r;
+ int len;
+
+ if (ctdb->methods == NULL) {
+ DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
+ return -1;
+ }
+
+ /* see if this is a message to ourselves */
+ if (pnn == ctdb->pnn) {
+ return ctdb_local_message(ctdb, srvid, data);
+ }
+
+ len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
+ r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len,
+ struct ctdb_req_message_old);
+ CTDB_NO_MEMORY(ctdb, r);
+
+ r->hdr.destnode = pnn;
+ r->srvid = srvid;
+ r->datalen = data.dsize;
+ memcpy(&r->data[0], data.dptr, data.dsize);
+
+ ctdb_queue_packet(ctdb, &r->hdr);
+
+ talloc_free(r);
+ return 0;
+}
+
+
+
+struct ctdb_client_notify_list {
+ struct ctdb_client_notify_list *next, *prev;
+ struct ctdb_context *ctdb;
+ uint64_t srvid;
+ TDB_DATA data;
+};
+
+
+static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
+{
+ int ret;
+
+ DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
+
+ ret = ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED, (unsigned long long)nl->srvid, nl->data);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
+ }
+
+ return 0;
+}
+
+int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+ struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr;
+ struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ struct ctdb_client_notify_list *nl;
+
+ DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
+
+ if (indata.dsize < offsetof(struct ctdb_notify_data_old, notify_data)) {
+ DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
+ return -1;
+ }
+
+ if (indata.dsize != (notify->len + offsetof(struct ctdb_notify_data_old, notify_data))) {
+ DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_notify_data_old, notify_data))));
+ return -1;
+ }
+
+
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+ return -1;
+ }
+
+ for(nl=client->notify; nl; nl=nl->next) {
+ if (nl->srvid == notify->srvid) {
+ break;
+ }
+ }
+ if (nl != NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
+ return -1;
+ }
+
+ nl = talloc(client, struct ctdb_client_notify_list);
+ CTDB_NO_MEMORY(ctdb, nl);
+ nl->ctdb = ctdb;
+ nl->srvid = notify->srvid;
+ nl->data.dsize = notify->len;
+ nl->data.dptr = talloc_memdup(nl, notify->notify_data,
+ nl->data.dsize);
+ CTDB_NO_MEMORY(ctdb, nl->data.dptr);
+
+ DLIST_ADD(client->notify, nl);
+ talloc_set_destructor(nl, ctdb_client_notify_destructor);
+
+ return 0;
+}
+
+int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+ uint64_t srvid = *(uint64_t *)indata.dptr;
+ struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
+ struct ctdb_client_notify_list *nl;
+
+ DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id));
+
+ if (client == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+ return -1;
+ }
+
+ for(nl=client->notify; nl; nl=nl->next) {
+ if (nl->srvid == srvid) {
+ break;
+ }
+ }
+ if (nl == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid));
+ return -1;
+ }
+
+ DLIST_REMOVE(client->notify, nl);
+ talloc_set_destructor(nl, NULL);
+ talloc_free(nl);
+
+ return 0;
+}
+
+struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
+{
+ struct ctdb_client_pid_list *client_pid;
+
+ for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) {
+ if (client_pid->pid == pid) {
+ return client_pid->client;
+ }
+ }
+ return NULL;
+}
+
+
+/* This control is used by samba when probing if a process (of a samba daemon)
+ exists on the node.
+ Samba does this when it needs/wants to check if a subrecord in one of the
+ databases is still valid, or if it is stale and can be removed.
+ If the node is in unhealthy or stopped state we just kill of the samba
+ process holding this sub-record and return to the calling samba that
+ the process does not exist.
+ This allows us to forcefully recall subrecords registered by samba processes
+ on banned and stopped nodes.
+*/
+int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
+{
+ struct ctdb_client *client;
+
+ client = ctdb_find_client_by_pid(ctdb, pid);
+ if (client == NULL) {
+ return -1;
+ }
+
+ if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
+ DEBUG(DEBUG_NOTICE,
+ ("Killing client with pid:%d on banned/stopped node\n",
+ (int)pid));
+ talloc_free(client);
+ return -1;
+ }
+
+ return kill(pid, 0);
+}
+
+int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb,
+ TDB_DATA indata)
+{
+ struct ctdb_client_pid_list *client_pid;
+ pid_t pid;
+ uint64_t srvid;
+ int ret;
+
+ pid = *(pid_t *)indata.dptr;
+ srvid = *(uint64_t *)(indata.dptr + sizeof(pid_t));
+
+ for (client_pid = ctdb->client_pids;
+ client_pid != NULL;
+ client_pid = client_pid->next) {
+ if (client_pid->pid == pid) {
+ ret = srvid_exists(ctdb->srv, srvid,
+ client_pid->client);
+ if (ret == 0) {
+ return 0;
+ }
+ }
+ }
+
+ return -1;
+}
+
+int ctdb_control_getnodesfile(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+ struct ctdb_node_map_old *node_map = NULL;
+
+ CHECK_CONTROL_DATA_SIZE(0);
+
+ node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
+ if (node_map == NULL) {
+ DEBUG(DEBUG_ERR, ("Failed to read nodes file\n"));
+ return -1;
+ }
+
+ outdata->dptr = (unsigned char *)node_map;
+ outdata->dsize = talloc_get_size(outdata->dptr);
+
+ return 0;
+}
+
+void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
+{
+ if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
+ DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
+ return;
+ }
+
+ DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n"));
+ ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
+ ctdb_stop_recoverd(ctdb);
+ ctdb_stop_keepalive(ctdb);
+ ctdb_stop_monitoring(ctdb);
+ ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+ ctdb_stop_eventd(ctdb);
+ if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
+ ctdb->methods->shutdown(ctdb);
+ }
+
+ DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n"));
+ exit(exit_code);
+}
+
+/* When forking the main daemon and the child process needs to connect
+ * back to the daemon as a client process, this function can be used
+ * to change the ctdb context from daemon into client mode. The child
+ * process must be created using ctdb_fork() and not fork() -
+ * ctdb_fork() does some necessary housekeeping.
+ */
+int switch_from_server_to_client(struct ctdb_context *ctdb)
+{
+ int ret;
+
+ if (ctdb->daemon.sd != -1) {
+ close(ctdb->daemon.sd);
+ ctdb->daemon.sd = -1;
+ }
+
+ /* get a new event context */
+ ctdb->ev = tevent_context_init(ctdb);
+ if (ctdb->ev == NULL) {
+ DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
+ exit(1);
+ }
+ tevent_loop_allow_nesting(ctdb->ev);
+
+ /* Connect to main CTDB daemon */
+ ret = ctdb_socket_connect(ctdb);
+ if (ret != 0) {
+ DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
+ return -1;
+ }
+
+ ctdb->can_send_controls = true;
+
+ return 0;
+}