diff options
Diffstat (limited to '')
43 files changed, 33017 insertions, 0 deletions
diff --git a/ctdb/server/ctdb_banning.c b/ctdb/server/ctdb_banning.c new file mode 100644 index 0000000..3c71157 --- /dev/null +++ b/ctdb/server/ctdb_banning.c @@ -0,0 +1,146 @@ +/* + ctdb banning code + + Copyright (C) Ronnie Sahlberg 2009 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ +#include "replace.h" +#include "system/time.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/common.h" +#include "common/logging.h" + +static void ctdb_ban_node_event(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + + /* Make sure we were able to freeze databases during banning */ + if (!ctdb_db_all_frozen(ctdb)) { + DEBUG(DEBUG_ERR, ("Banning timed out, but not all databases " + "frozen yet - banning this node again.\n")); + ctdb_ban_self(ctdb); + return; + } + + DEBUG(DEBUG_ERR,("Banning timed out\n")); + ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_BANNED; + + if (ctdb->banning_ctx != NULL) { + talloc_free(ctdb->banning_ctx); + ctdb->banning_ctx = NULL; + } +} + +int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct 
ctdb_ban_state *bantime = (struct ctdb_ban_state *)indata.dptr; + bool already_banned; + + DEBUG(DEBUG_INFO,("SET BAN STATE\n")); + + if (bantime->pnn != ctdb->pnn) { + DEBUG(DEBUG_WARNING, + ("SET_BAN_STATE control for PNN %d ignored\n", + bantime->pnn)); + return -1; + } + + already_banned = false; + if (ctdb->banning_ctx != NULL) { + talloc_free(ctdb->banning_ctx); + ctdb->banning_ctx = NULL; + already_banned = true; + } + + if (bantime->time == 0) { + DEBUG(DEBUG_ERR,("Unbanning this node\n")); + ctdb->nodes[bantime->pnn]->flags &= ~NODE_FLAGS_BANNED; + return 0; + } + + if (ctdb->tunable.enable_bans == 0) { + DEBUG(DEBUG_ERR,("Bans are disabled - ignoring ban of node %u\n", bantime->pnn)); + return 0; + } + + ctdb->banning_ctx = talloc(ctdb, struct ctdb_ban_state); + if (ctdb->banning_ctx == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " ERROR Failed to allocate new banning state\n")); + return -1; + } + *((struct ctdb_ban_state *)(ctdb->banning_ctx)) = *bantime; + + + DEBUG(DEBUG_ERR,("Banning this node for %d seconds\n", bantime->time)); + ctdb->nodes[bantime->pnn]->flags |= NODE_FLAGS_BANNED; + + tevent_add_timer(ctdb->ev, ctdb->banning_ctx, + timeval_current_ofs(bantime->time,0), + ctdb_ban_node_event, ctdb); + + if (!already_banned) { + ctdb_node_become_inactive(ctdb); + } + return 0; +} + +int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata) +{ + struct ctdb_ban_state *bantime; + + bantime = talloc(outdata, struct ctdb_ban_state); + CTDB_NO_MEMORY(ctdb, bantime); + + if (ctdb->banning_ctx != NULL) { + *bantime = *(struct ctdb_ban_state *)(ctdb->banning_ctx); + } else { + bantime->pnn = ctdb->pnn; + bantime->time = 0; + } + + outdata->dptr = (uint8_t *)bantime; + outdata->dsize = sizeof(struct ctdb_ban_state); + + return 0; +} + +/* Routine to ban ourselves for a while when trouble strikes. 
*/ +void ctdb_ban_self(struct ctdb_context *ctdb) +{ + TDB_DATA data; + struct ctdb_ban_state bantime; + + bantime.pnn = ctdb->pnn; + bantime.time = ctdb->tunable.recovery_ban_period; + + data.dsize = sizeof(bantime); + data.dptr = (uint8_t *)&bantime; + + ctdb_control_set_ban_state(ctdb, data); +} diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c new file mode 100644 index 0000000..a51a92d --- /dev/null +++ b/ctdb/server/ctdb_call.c @@ -0,0 +1,2086 @@ +/* + ctdb_call protocol code + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ +/* + see http://wiki.samba.org/index.php/Samba_%26_Clustering for + protocol design and packet details +*/ +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/util_process.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/rb_tree.h" +#include "common/reqid.h" +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" +#include "common/hash_count.h" + +struct ctdb_sticky_record { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + TDB_CONTEXT *pindown; +}; + +/* + find the ctdb_db from a db index + */ + struct ctdb_db_context *find_ctdb_db(struct ctdb_context *ctdb, uint32_t id) +{ + struct ctdb_db_context *ctdb_db; + + for (ctdb_db=ctdb->db_list; ctdb_db; ctdb_db=ctdb_db->next) { + if (ctdb_db->db_id == id) { + break; + } + } + return ctdb_db; +} + +/* + a variant of input packet that can be used in lock requeue +*/ +static void ctdb_call_input_pkt(void *p, struct ctdb_req_header *hdr) +{ + struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context); + ctdb_input_pkt(ctdb, hdr); +} + + +/* + send an error reply +*/ +static void ctdb_send_error(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr, uint32_t status, + const char *fmt, ...) PRINTF_ATTRIBUTE(4,5); +static void ctdb_send_error(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr, uint32_t status, + const char *fmt, ...) +{ + va_list ap; + struct ctdb_reply_error_old *r; + char *msg; + int msglen, len; + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_INFO,(__location__ " Failed to send error. 
Transport is DOWN\n")); + return; + } + + va_start(ap, fmt); + msg = talloc_vasprintf(ctdb, fmt, ap); + if (msg == NULL) { + ctdb_fatal(ctdb, "Unable to allocate error in ctdb_send_error\n"); + } + va_end(ap); + + msglen = strlen(msg)+1; + len = offsetof(struct ctdb_reply_error_old, msg); + r = ctdb_transport_allocate(ctdb, msg, CTDB_REPLY_ERROR, len + msglen, + struct ctdb_reply_error_old); + CTDB_NO_MEMORY_FATAL(ctdb, r); + + r->hdr.destnode = hdr->srcnode; + r->hdr.reqid = hdr->reqid; + r->status = status; + r->msglen = msglen; + memcpy(&r->msg[0], msg, msglen); + + ctdb_queue_packet(ctdb, &r->hdr); + + talloc_free(msg); +} + + +/** + * send a redirect reply + * + * The logic behind this function is this: + * + * A client wants to grab a record and sends a CTDB_REQ_CALL packet + * to its local ctdb (ctdb_request_call). If the node is not itself + * the record's DMASTER, it first redirects the packet to the + * record's LMASTER. The LMASTER then redirects the call packet to + * the current DMASTER. Note that this works because of this: When + * a record is migrated off a node, then the new DMASTER is stored + * in the record's copy on the former DMASTER. + */ +static void ctdb_call_send_redirect(struct ctdb_context *ctdb, + struct ctdb_db_context *ctdb_db, + TDB_DATA key, + struct ctdb_req_call_old *c, + struct ctdb_ltdb_header *header) +{ + uint32_t lmaster = ctdb_lmaster(ctdb, &key); + + c->hdr.destnode = lmaster; + if (ctdb->pnn == lmaster) { + c->hdr.destnode = header->dmaster; + } + c->hopcount++; + + if (c->hopcount%100 > 95) { + DEBUG(DEBUG_WARNING,("High hopcount %d dbid:%s " + "key:0x%08x reqid=%08x pnn:%d src:%d lmaster:%d " + "header->dmaster:%d dst:%d\n", + c->hopcount, ctdb_db->db_name, ctdb_hash(&key), + c->hdr.reqid, ctdb->pnn, c->hdr.srcnode, lmaster, + header->dmaster, c->hdr.destnode)); + } + + ctdb_queue_packet(ctdb, &c->hdr); +} + + +/* + send a dmaster reply + + caller must have the chainlock before calling this routine. 
Caller must be + the lmaster +*/ +static void ctdb_send_dmaster_reply(struct ctdb_db_context *ctdb_db, + struct ctdb_ltdb_header *header, + TDB_DATA key, TDB_DATA data, + uint32_t new_dmaster, + uint32_t reqid) +{ + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct ctdb_reply_dmaster_old *r; + int ret, len; + TALLOC_CTX *tmp_ctx; + + if (ctdb->pnn != ctdb_lmaster(ctdb, &key)) { + DEBUG(DEBUG_ALERT,(__location__ " Caller is not lmaster!\n")); + return; + } + + header->dmaster = new_dmaster; + ret = ctdb_ltdb_store(ctdb_db, key, header, data); + if (ret != 0) { + ctdb_fatal(ctdb, "ctdb_send_dmaster_reply unable to update dmaster"); + return; + } + + if (ctdb->methods == NULL) { + ctdb_fatal(ctdb, "ctdb_send_dmaster_reply can't update dmaster since transport is down"); + return; + } + + /* put the packet on a temporary context, allowing us to safely free + it below even if ctdb_reply_dmaster() has freed it already */ + tmp_ctx = talloc_new(ctdb); + + /* send the CTDB_REPLY_DMASTER */ + len = offsetof(struct ctdb_reply_dmaster_old, data) + key.dsize + data.dsize + sizeof(uint32_t); + r = ctdb_transport_allocate(ctdb, tmp_ctx, CTDB_REPLY_DMASTER, len, + struct ctdb_reply_dmaster_old); + CTDB_NO_MEMORY_FATAL(ctdb, r); + + r->hdr.destnode = new_dmaster; + r->hdr.reqid = reqid; + r->hdr.generation = ctdb_db->generation; + r->rsn = header->rsn; + r->keylen = key.dsize; + r->datalen = data.dsize; + r->db_id = ctdb_db->db_id; + memcpy(&r->data[0], key.dptr, key.dsize); + memcpy(&r->data[key.dsize], data.dptr, data.dsize); + memcpy(&r->data[key.dsize+data.dsize], &header->flags, sizeof(uint32_t)); + + ctdb_queue_packet(ctdb, &r->hdr); + + talloc_free(tmp_ctx); +} + +/* + send a dmaster request (give another node the dmaster for a record) + + This is always sent to the lmaster, which ensures that the lmaster + always knows who the dmaster is. 
The lmaster will then send a + CTDB_REPLY_DMASTER to the new dmaster +*/ +static void ctdb_call_send_dmaster(struct ctdb_db_context *ctdb_db, + struct ctdb_req_call_old *c, + struct ctdb_ltdb_header *header, + TDB_DATA *key, TDB_DATA *data) +{ + struct ctdb_req_dmaster_old *r; + struct ctdb_context *ctdb = ctdb_db->ctdb; + int len; + uint32_t lmaster = ctdb_lmaster(ctdb, key); + + if (ctdb->methods == NULL) { + ctdb_fatal(ctdb, "Failed ctdb_call_send_dmaster since transport is down"); + return; + } + + if (data->dsize != 0) { + header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA; + } + + if (lmaster == ctdb->pnn) { + ctdb_send_dmaster_reply(ctdb_db, header, *key, *data, + c->hdr.srcnode, c->hdr.reqid); + return; + } + + len = offsetof(struct ctdb_req_dmaster_old, data) + key->dsize + data->dsize + + sizeof(uint32_t); + r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_DMASTER, len, + struct ctdb_req_dmaster_old); + CTDB_NO_MEMORY_FATAL(ctdb, r); + r->hdr.destnode = lmaster; + r->hdr.reqid = c->hdr.reqid; + r->hdr.generation = ctdb_db->generation; + r->db_id = c->db_id; + r->rsn = header->rsn; + r->dmaster = c->hdr.srcnode; + r->keylen = key->dsize; + r->datalen = data->dsize; + memcpy(&r->data[0], key->dptr, key->dsize); + memcpy(&r->data[key->dsize], data->dptr, data->dsize); + memcpy(&r->data[key->dsize + data->dsize], &header->flags, sizeof(uint32_t)); + + header->dmaster = c->hdr.srcnode; + if (ctdb_ltdb_store(ctdb_db, *key, header, *data) != 0) { + ctdb_fatal(ctdb, "Failed to store record in ctdb_call_send_dmaster"); + } + + ctdb_queue_packet(ctdb, &r->hdr); + + talloc_free(r); +} + +static void ctdb_sticky_pindown_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_sticky_record *sr = talloc_get_type(private_data, + struct ctdb_sticky_record); + + DEBUG(DEBUG_ERR,("Pindown timeout db:%s unstick record\n", sr->ctdb_db->db_name)); + if (sr->pindown != NULL) { + talloc_free(sr->pindown); + 
sr->pindown = NULL; + } +} + +static int +ctdb_set_sticky_pindown(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key) +{ + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + uint32_t *k; + struct ctdb_sticky_record *sr; + + k = ctdb_key_to_idkey(tmp_ctx, key); + if (k == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n")); + talloc_free(tmp_ctx); + return -1; + } + + sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]); + if (sr == NULL) { + talloc_free(tmp_ctx); + return 0; + } + + talloc_free(tmp_ctx); + + if (sr->pindown == NULL) { + DEBUG(DEBUG_ERR,("Pinning down record in %s for %d ms\n", ctdb_db->db_name, ctdb->tunable.sticky_pindown)); + sr->pindown = talloc_new(sr); + if (sr->pindown == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate pindown context for sticky record\n")); + return -1; + } + tevent_add_timer(ctdb->ev, sr->pindown, + timeval_current_ofs(ctdb->tunable.sticky_pindown / 1000, + (ctdb->tunable.sticky_pindown * 1000) % 1000000), + ctdb_sticky_pindown_timeout, sr); + } + + return 0; +} + +/* + called when a CTDB_REPLY_DMASTER packet comes in, or when the lmaster + gets a CTDB_REQUEST_DMASTER for itself. We become the dmaster. + + must be called with the chainlock held. 
This function releases the chainlock +*/ +static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db, + struct ctdb_req_header *hdr, + TDB_DATA key, TDB_DATA data, + uint64_t rsn, uint32_t record_flags) +{ + struct ctdb_call_state *state; + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct ctdb_ltdb_header header; + int ret; + + DEBUG(DEBUG_DEBUG,("pnn %u dmaster response %08x\n", ctdb->pnn, ctdb_hash(&key))); + + ZERO_STRUCT(header); + header.rsn = rsn; + header.dmaster = ctdb->pnn; + header.flags = record_flags; + + state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state); + + if (state) { + if (state->call->flags & CTDB_CALL_FLAG_VACUUM_MIGRATION) { + /* + * We temporarily add the VACUUM_MIGRATED flag to + * the record flags, so that ctdb_ltdb_store can + * decide whether the record should be stored or + * deleted. + */ + header.flags |= CTDB_REC_FLAG_VACUUM_MIGRATED; + } + } + + if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) { + ctdb_fatal(ctdb, "ctdb_reply_dmaster store failed\n"); + + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + return; + } + + /* we just became DMASTER and this database is "sticky", + see if the record is flagged as "hot" and set up a pin-down + context to stop migrations for a little while if so + */ + if (ctdb_db_sticky(ctdb_db)) { + ctdb_set_sticky_pindown(ctdb, ctdb_db, key); + } + + if (state == NULL) { + DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_become_dmaster from node %u\n", + ctdb->pnn, hdr->reqid, hdr->srcnode)); + + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + return; + } + + if (key.dsize != state->call->key.dsize || memcmp(key.dptr, state->call->key.dptr, key.dsize)) { + DEBUG(DEBUG_ERR, ("Got bogus DMASTER packet reqid:%u from node %u. 
Key does not match key held in matching idr.\n", hdr->reqid, hdr->srcnode)); + + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + return; + } + + if (hdr->reqid != state->reqid) { + /* we found a record but it was the wrong one */ + DEBUG(DEBUG_ERR, ("Dropped orphan in ctdb_become_dmaster with reqid:%u\n from node %u", hdr->reqid, hdr->srcnode)); + + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + return; + } + + (void) hash_count_increment(ctdb_db->migratedb, key); + + ctdb_call_local(ctdb_db, state->call, &header, state, &data, true); + + ret = ctdb_ltdb_unlock(ctdb_db, state->call->key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + + state->state = CTDB_CALL_DONE; + if (state->async.fn) { + state->async.fn(state); + } +} + +struct dmaster_defer_call { + struct dmaster_defer_call *next, *prev; + struct ctdb_context *ctdb; + struct ctdb_req_header *hdr; +}; + +struct dmaster_defer_queue { + struct ctdb_db_context *ctdb_db; + uint32_t generation; + struct dmaster_defer_call *deferred_calls; +}; + +static void dmaster_defer_reprocess(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct dmaster_defer_call *call = talloc_get_type( + private_data, struct dmaster_defer_call); + + ctdb_input_pkt(call->ctdb, call->hdr); + talloc_free(call); +} + +static int dmaster_defer_queue_destructor(struct dmaster_defer_queue *ddq) +{ + /* Ignore requests, if database recovery happens in-between. 
*/ + if (ddq->generation != ddq->ctdb_db->generation) { + return 0; + } + + while (ddq->deferred_calls != NULL) { + struct dmaster_defer_call *call = ddq->deferred_calls; + + DLIST_REMOVE(ddq->deferred_calls, call); + + talloc_steal(call->ctdb, call); + tevent_add_timer(call->ctdb->ev, call, timeval_zero(), + dmaster_defer_reprocess, call); + } + return 0; +} + +static void *insert_ddq_callback(void *parm, void *data) +{ + if (data) { + talloc_free(data); + } + return parm; +} + +/** + * This function is used to register a key in database that needs to be updated. + * Any requests for that key should get deferred till this is completed. + */ +static int dmaster_defer_setup(struct ctdb_db_context *ctdb_db, + struct ctdb_req_header *hdr, + TDB_DATA key) +{ + uint32_t *k; + struct dmaster_defer_queue *ddq; + + k = ctdb_key_to_idkey(hdr, key); + if (k == NULL) { + DEBUG(DEBUG_ERR, ("Failed to allocate key for dmaster defer setup\n")); + return -1; + } + + /* Already exists */ + ddq = trbt_lookuparray32(ctdb_db->defer_dmaster, k[0], k); + if (ddq != NULL) { + if (ddq->generation == ctdb_db->generation) { + talloc_free(k); + return 0; + } + + /* Recovery occurred - get rid of old queue. All the deferred + * requests will be resent anyway from ctdb_call_resend_db. 
+ */ + talloc_free(ddq); + } + + ddq = talloc(hdr, struct dmaster_defer_queue); + if (ddq == NULL) { + DEBUG(DEBUG_ERR, ("Failed to allocate dmaster defer queue\n")); + talloc_free(k); + return -1; + } + ddq->ctdb_db = ctdb_db; + ddq->generation = hdr->generation; + ddq->deferred_calls = NULL; + + trbt_insertarray32_callback(ctdb_db->defer_dmaster, k[0], k, + insert_ddq_callback, ddq); + talloc_set_destructor(ddq, dmaster_defer_queue_destructor); + + talloc_free(k); + return 0; +} + +static int dmaster_defer_add(struct ctdb_db_context *ctdb_db, + struct ctdb_req_header *hdr, + TDB_DATA key) +{ + struct dmaster_defer_queue *ddq; + struct dmaster_defer_call *call; + uint32_t *k; + + k = ctdb_key_to_idkey(hdr, key); + if (k == NULL) { + DEBUG(DEBUG_ERR, ("Failed to allocate key for dmaster defer add\n")); + return -1; + } + + ddq = trbt_lookuparray32(ctdb_db->defer_dmaster, k[0], k); + if (ddq == NULL) { + talloc_free(k); + return -1; + } + + talloc_free(k); + + if (ddq->generation != hdr->generation) { + talloc_set_destructor(ddq, NULL); + talloc_free(ddq); + return -1; + } + + call = talloc(ddq, struct dmaster_defer_call); + if (call == NULL) { + DEBUG(DEBUG_ERR, ("Failed to allocate dmaster defer call\n")); + return -1; + } + + call->ctdb = ctdb_db->ctdb; + call->hdr = talloc_steal(call, hdr); + + DLIST_ADD_END(ddq->deferred_calls, call); + + return 0; +} + +/* + called when a CTDB_REQ_DMASTER packet comes in + + this comes into the lmaster for a record when the current dmaster + wants to give up the dmaster role and give it to someone else +*/ +void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_req_dmaster_old *c = (struct ctdb_req_dmaster_old *)hdr; + TDB_DATA key, data, data2; + struct ctdb_ltdb_header header; + struct ctdb_db_context *ctdb_db; + uint32_t record_flags = 0; + size_t len; + int ret; + + key.dptr = c->data; + key.dsize = c->keylen; + data.dptr = c->data + c->keylen; + data.dsize = c->datalen; + len = 
offsetof(struct ctdb_req_dmaster_old, data) + key.dsize + data.dsize + + sizeof(uint32_t); + if (len <= c->hdr.length) { + memcpy(&record_flags, &c->data[c->keylen + c->datalen], + sizeof(record_flags)); + } + + ctdb_db = find_ctdb_db(ctdb, c->db_id); + if (!ctdb_db) { + ctdb_send_error(ctdb, hdr, -1, + "Unknown database in request. db_id==0x%08x", + c->db_id); + return; + } + + dmaster_defer_setup(ctdb_db, hdr, key); + + /* fetch the current record */ + ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, hdr, &data2, + ctdb_call_input_pkt, ctdb, false); + if (ret == -1) { + ctdb_fatal(ctdb, "ctdb_req_dmaster failed to fetch record"); + return; + } + if (ret == -2) { + DEBUG(DEBUG_INFO,(__location__ " deferring ctdb_request_dmaster\n")); + return; + } + + if (ctdb_lmaster(ctdb, &key) != ctdb->pnn) { + DEBUG(DEBUG_ERR, ("dmaster request to non-lmaster " + "db=%s lmaster=%u gen=%u curgen=%u\n", + ctdb_db->db_name, ctdb_lmaster(ctdb, &key), + hdr->generation, ctdb_db->generation)); + ctdb_fatal(ctdb, "ctdb_req_dmaster to non-lmaster"); + } + + DEBUG(DEBUG_DEBUG,("pnn %u dmaster request on %08x for %u from %u\n", + ctdb->pnn, ctdb_hash(&key), c->dmaster, c->hdr.srcnode)); + + /* its a protocol error if the sending node is not the current dmaster */ + if (header.dmaster != hdr->srcnode) { + DEBUG(DEBUG_ALERT,("pnn %u dmaster request for new-dmaster %u from non-master %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u keyval=0x%08x\n", + ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key), + ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation, + (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid, + (key.dsize >= 4)?(*(uint32_t *)key.dptr):0)); + if (header.rsn != 0 || header.dmaster != ctdb->pnn) { + DEBUG(DEBUG_ERR,("ctdb_req_dmaster from non-master. 
Force a recovery.\n")); + + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + ctdb_ltdb_unlock(ctdb_db, key); + return; + } + } + + if (header.rsn > c->rsn) { + DEBUG(DEBUG_ALERT,("pnn %u dmaster request with older RSN new-dmaster %u from %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u\n", + ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key), + ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation, + (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid)); + } + + /* use the rsn from the sending node */ + header.rsn = c->rsn; + + /* store the record flags from the sending node */ + header.flags = record_flags; + + /* check if the new dmaster is the lmaster, in which case we + skip the dmaster reply */ + if (c->dmaster == ctdb->pnn) { + ctdb_become_dmaster(ctdb_db, hdr, key, data, c->rsn, record_flags); + } else { + ctdb_send_dmaster_reply(ctdb_db, &header, key, data, c->dmaster, hdr->reqid); + + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + } +} + +static void ctdb_sticky_record_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_sticky_record *sr = talloc_get_type(private_data, + struct ctdb_sticky_record); + talloc_free(sr); +} + +static void *ctdb_make_sticky_record_callback(void *parm, void *data) +{ + if (data) { + DEBUG(DEBUG_ERR,("Already have sticky record registered. 
Free old %p and create new %p\n", data, parm)); + talloc_free(data); + } + return parm; +} + +static int +ctdb_make_record_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key) +{ + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + uint32_t *k; + struct ctdb_sticky_record *sr; + + k = ctdb_key_to_idkey(tmp_ctx, key); + if (k == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n")); + talloc_free(tmp_ctx); + return -1; + } + + sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]); + if (sr != NULL) { + talloc_free(tmp_ctx); + return 0; + } + + sr = talloc(ctdb_db->sticky_records, struct ctdb_sticky_record); + if (sr == NULL) { + talloc_free(tmp_ctx); + DEBUG(DEBUG_ERR,("Failed to allocate sticky record structure\n")); + return -1; + } + + sr->ctdb = ctdb; + sr->ctdb_db = ctdb_db; + sr->pindown = NULL; + + DEBUG(DEBUG_ERR,("Make record sticky for %d seconds in db %s key:0x%08x.\n", + ctdb->tunable.sticky_duration, + ctdb_db->db_name, ctdb_hash(&key))); + + trbt_insertarray32_callback(ctdb_db->sticky_records, k[0], &k[0], ctdb_make_sticky_record_callback, sr); + + tevent_add_timer(ctdb->ev, sr, + timeval_current_ofs(ctdb->tunable.sticky_duration, 0), + ctdb_sticky_record_timeout, sr); + + talloc_free(tmp_ctx); + return 0; +} + +struct pinned_down_requeue_handle { + struct ctdb_context *ctdb; + struct ctdb_req_header *hdr; +}; + +struct pinned_down_deferred_call { + struct ctdb_context *ctdb; + struct ctdb_req_header *hdr; +}; + +static void pinned_down_requeue(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct pinned_down_requeue_handle *handle = talloc_get_type(private_data, struct pinned_down_requeue_handle); + struct ctdb_context *ctdb = handle->ctdb; + + talloc_steal(ctdb, handle->hdr); + ctdb_call_input_pkt(ctdb, handle->hdr); + + talloc_free(handle); +} + +static int pinned_down_destructor(struct pinned_down_deferred_call *pinned_down) +{ + struct ctdb_context 
*ctdb = pinned_down->ctdb; + struct pinned_down_requeue_handle *handle = talloc(ctdb, struct pinned_down_requeue_handle); + + handle->ctdb = pinned_down->ctdb; + handle->hdr = pinned_down->hdr; + talloc_steal(handle, handle->hdr); + + tevent_add_timer(ctdb->ev, handle, timeval_zero(), + pinned_down_requeue, handle); + + return 0; +} + +static int +ctdb_defer_pinned_down_request(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr) +{ + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + uint32_t *k; + struct ctdb_sticky_record *sr; + struct pinned_down_deferred_call *pinned_down; + + k = ctdb_key_to_idkey(tmp_ctx, key); + if (k == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n")); + talloc_free(tmp_ctx); + return -1; + } + + sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]); + if (sr == NULL) { + talloc_free(tmp_ctx); + return -1; + } + + talloc_free(tmp_ctx); + + if (sr->pindown == NULL) { + return -1; + } + + pinned_down = talloc(sr->pindown, struct pinned_down_deferred_call); + if (pinned_down == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate structure for deferred pinned down request\n")); + return -1; + } + + pinned_down->ctdb = ctdb; + pinned_down->hdr = hdr; + + talloc_set_destructor(pinned_down, pinned_down_destructor); + talloc_steal(pinned_down, hdr); + + return 0; +} + +static int hot_key_cmp(const void *a, const void *b) +{ + const struct ctdb_db_hot_key *ka = (const struct ctdb_db_hot_key *)a; + const struct ctdb_db_hot_key *kb = (const struct ctdb_db_hot_key *)b; + + if (ka->count < kb->count) { + return -1; + } + if (ka->count > kb->count) { + return 1; + } + + return 0; +} + +static void +ctdb_update_db_stat_hot_keys(struct ctdb_db_context *ctdb_db, TDB_DATA key, + unsigned int count) +{ + unsigned int i, id; + char *keystr; + + /* + * If all slots are being used then only need to compare + * against the count in the 0th slot, since it contains the + * smallest count. 
+ */ + if (ctdb_db->statistics.num_hot_keys == MAX_HOT_KEYS && + count <= ctdb_db->hot_keys[0].count) { + return; + } + + /* see if we already know this key */ + for (i = 0; i < MAX_HOT_KEYS; i++) { + if (key.dsize != ctdb_db->hot_keys[i].key.dsize) { + continue; + } + if (memcmp(key.dptr, ctdb_db->hot_keys[i].key.dptr, key.dsize)) { + continue; + } + /* found an entry for this key */ + if (count <= ctdb_db->hot_keys[i].count) { + return; + } + if (count >= (2 * ctdb_db->hot_keys[i].last_logged_count)) { + keystr = hex_encode_talloc(ctdb_db, + (unsigned char *)key.dptr, + key.dsize); + D_NOTICE("Updated hot key database=%s key=%s count=%d\n", + ctdb_db->db_name, + keystr ? keystr : "" , + count); + TALLOC_FREE(keystr); + ctdb_db->hot_keys[i].last_logged_count = count; + } + ctdb_db->hot_keys[i].count = count; + goto sort_keys; + } + + if (ctdb_db->statistics.num_hot_keys < MAX_HOT_KEYS) { + id = ctdb_db->statistics.num_hot_keys; + ctdb_db->statistics.num_hot_keys++; + } else { + id = 0; + } + + if (ctdb_db->hot_keys[id].key.dptr != NULL) { + talloc_free(ctdb_db->hot_keys[id].key.dptr); + } + ctdb_db->hot_keys[id].key.dsize = key.dsize; + ctdb_db->hot_keys[id].key.dptr = talloc_memdup(ctdb_db, + key.dptr, + key.dsize); + ctdb_db->hot_keys[id].count = count; + + keystr = hex_encode_talloc(ctdb_db, + (unsigned char *)key.dptr, key.dsize); + D_NOTICE("Added hot key database=%s key=%s count=%d\n", + ctdb_db->db_name, + keystr ? 
keystr : "" , + count); + talloc_free(keystr); + ctdb_db->hot_keys[id].last_logged_count = count; + +sort_keys: + qsort(&ctdb_db->hot_keys[0], + ctdb_db->statistics.num_hot_keys, + sizeof(struct ctdb_db_hot_key), + hot_key_cmp); +} + +/* + called when a CTDB_REQ_CALL packet comes in +*/ +void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_req_call_old *c = (struct ctdb_req_call_old *)hdr; + TDB_DATA data; + struct ctdb_reply_call_old *r; + int ret, len; + struct ctdb_ltdb_header header; + struct ctdb_call *call; + struct ctdb_db_context *ctdb_db; + int tmp_count, bucket; + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_INFO,(__location__ " Failed ctdb_request_call. Transport is DOWN\n")); + return; + } + + + ctdb_db = find_ctdb_db(ctdb, c->db_id); + if (!ctdb_db) { + ctdb_send_error(ctdb, hdr, -1, + "Unknown database in request. db_id==0x%08x", + c->db_id); + return; + } + + call = talloc(hdr, struct ctdb_call); + CTDB_NO_MEMORY_FATAL(ctdb, call); + + call->call_id = c->callid; + call->key.dptr = c->data; + call->key.dsize = c->keylen; + call->call_data.dptr = c->data + c->keylen; + call->call_data.dsize = c->calldatalen; + call->reply_data.dptr = NULL; + call->reply_data.dsize = 0; + + + /* If this record is pinned down we should defer the + request until the pindown times out + */ + if (ctdb_db_sticky(ctdb_db)) { + if (ctdb_defer_pinned_down_request(ctdb, ctdb_db, call->key, hdr) == 0) { + DEBUG(DEBUG_WARNING, + ("Defer request for pinned down record in %s\n", ctdb_db->db_name)); + talloc_free(call); + return; + } + } + + if (dmaster_defer_add(ctdb_db, hdr, call->key) == 0) { + talloc_free(call); + return; + } + + /* determine if we are the dmaster for this key. 
This also + fetches the record data (if any), thus avoiding a 2nd fetch of the data + if the call will be answered locally */ + + ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, call->key, &header, hdr, &data, + ctdb_call_input_pkt, ctdb, false); + if (ret == -1) { + ctdb_send_error(ctdb, hdr, ret, "ltdb fetch failed in ctdb_request_call"); + talloc_free(call); + return; + } + if (ret == -2) { + DEBUG(DEBUG_INFO,(__location__ " deferred ctdb_request_call\n")); + talloc_free(call); + return; + } + + /* Dont do READONLY if we don't have a tracking database */ + if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) { + c->flags &= ~CTDB_WANT_READONLY; + } + + if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) { + header.flags &= ~CTDB_REC_RO_FLAGS; + CTDB_INCREMENT_STAT(ctdb, total_ro_revokes); + CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes); + if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) { + ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag"); + } + /* and clear out the tracking data */ + if (tdb_delete(ctdb_db->rottdb, call->key) != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n")); + } + } + + /* if we are revoking, we must defer all other calls until the revoke + * had completed. + */ + if (header.flags & CTDB_REC_RO_REVOKING_READONLY) { + talloc_free(data.dptr); + ret = ctdb_ltdb_unlock(ctdb_db, call->key); + + if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) { + ctdb_fatal(ctdb, "Failed to add deferred call for revoke child"); + } + talloc_free(call); + return; + } + + /* + * If we are not the dmaster and are not hosting any delegations, + * then we redirect the request to the node than can answer it + * (the lmaster or the dmaster). 
+ */ + if ((header.dmaster != ctdb->pnn) + && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) ) { + talloc_free(data.dptr); + ctdb_call_send_redirect(ctdb, ctdb_db, call->key, c, &header); + + ret = ctdb_ltdb_unlock(ctdb_db, call->key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + talloc_free(call); + return; + } + + if ( (!(c->flags & CTDB_WANT_READONLY)) + && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) { + header.flags |= CTDB_REC_RO_REVOKING_READONLY; + if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) { + ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set"); + } + ret = ctdb_ltdb_unlock(ctdb_db, call->key); + + if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, call->key, &header, data) != 0) { + ctdb_fatal(ctdb, "Failed to start record revoke"); + } + talloc_free(data.dptr); + + if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) { + ctdb_fatal(ctdb, "Failed to add deferred call for revoke child"); + } + talloc_free(call); + + return; + } + + /* If this is the first request for delegation. 
bump rsn and set + * the delegations flag + */ + if ((c->flags & CTDB_WANT_READONLY) + && (c->callid == CTDB_FETCH_WITH_HEADER_FUNC) + && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS))) { + header.rsn += 3; + header.flags |= CTDB_REC_RO_HAVE_DELEGATIONS; + if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) { + ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set"); + } + } + if ((c->flags & CTDB_WANT_READONLY) + && ((unsigned int)call->call_id == CTDB_FETCH_WITH_HEADER_FUNC)) { + TDB_DATA tdata; + + tdata = tdb_fetch(ctdb_db->rottdb, call->key); + if (ctdb_trackingdb_add_pnn(ctdb, &tdata, c->hdr.srcnode) != 0) { + ctdb_fatal(ctdb, "Failed to add node to trackingdb"); + } + if (tdb_store(ctdb_db->rottdb, call->key, tdata, TDB_REPLACE) != 0) { + ctdb_fatal(ctdb, "Failed to store trackingdb data"); + } + free(tdata.dptr); + + ret = ctdb_ltdb_unlock(ctdb_db, call->key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + + len = offsetof(struct ctdb_reply_call_old, data) + data.dsize + sizeof(struct ctdb_ltdb_header); + r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len, + struct ctdb_reply_call_old); + CTDB_NO_MEMORY_FATAL(ctdb, r); + r->hdr.destnode = c->hdr.srcnode; + r->hdr.reqid = c->hdr.reqid; + r->hdr.generation = ctdb_db->generation; + r->status = 0; + r->datalen = data.dsize + sizeof(struct ctdb_ltdb_header); + header.rsn -= 2; + header.flags |= CTDB_REC_RO_HAVE_READONLY; + header.flags &= ~CTDB_REC_RO_HAVE_DELEGATIONS; + memcpy(&r->data[0], &header, sizeof(struct ctdb_ltdb_header)); + + if (data.dsize) { + memcpy(&r->data[sizeof(struct ctdb_ltdb_header)], data.dptr, data.dsize); + } + + ctdb_queue_packet(ctdb, &r->hdr); + CTDB_INCREMENT_STAT(ctdb, total_ro_delegations); + CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_delegations); + + talloc_free(r); + talloc_free(call); + return; + } + + CTDB_UPDATE_STAT(ctdb, max_hop_count, c->hopcount); + tmp_count = c->hopcount; + bucket 
= 0; + while (tmp_count) { + tmp_count >>= 1; + bucket++; + } + if (bucket >= MAX_COUNT_BUCKETS) { + bucket = MAX_COUNT_BUCKETS - 1; + } + CTDB_INCREMENT_STAT(ctdb, hop_count_bucket[bucket]); + CTDB_INCREMENT_DB_STAT(ctdb_db, hop_count_bucket[bucket]); + + /* If this database supports sticky records, then check if the + hopcount is big. If it is it means the record is hot and we + should make it sticky. + */ + if (ctdb_db_sticky(ctdb_db) && + c->hopcount >= ctdb->tunable.hopcount_make_sticky) { + ctdb_make_record_sticky(ctdb, ctdb_db, call->key); + } + + + /* Try if possible to migrate the record off to the caller node. + * From the clients perspective a fetch of the data is just as + * expensive as a migration. + */ + if (c->hdr.srcnode != ctdb->pnn) { + if (ctdb_db->persistent_state) { + DEBUG(DEBUG_INFO, (__location__ " refusing migration" + " of key %s while transaction is active\n", + (char *)call->key.dptr)); + } else { + DEBUG(DEBUG_DEBUG,("pnn %u starting migration of %08x to %u\n", + ctdb->pnn, ctdb_hash(&(call->key)), c->hdr.srcnode)); + ctdb_call_send_dmaster(ctdb_db, c, &header, &(call->key), &data); + talloc_free(data.dptr); + + ret = ctdb_ltdb_unlock(ctdb_db, call->key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + } + talloc_free(call); + return; + } + + ret = ctdb_call_local(ctdb_db, call, &header, hdr, &data, true); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_call_local failed\n")); + call->status = -1; + } + + ret = ctdb_ltdb_unlock(ctdb_db, call->key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + + len = offsetof(struct ctdb_reply_call_old, data) + call->reply_data.dsize; + r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len, + struct ctdb_reply_call_old); + CTDB_NO_MEMORY_FATAL(ctdb, r); + r->hdr.destnode = hdr->srcnode; + r->hdr.reqid = hdr->reqid; + r->hdr.generation = ctdb_db->generation; + 
r->status = call->status; + r->datalen = call->reply_data.dsize; + if (call->reply_data.dsize) { + memcpy(&r->data[0], call->reply_data.dptr, call->reply_data.dsize); + } + + ctdb_queue_packet(ctdb, &r->hdr); + + talloc_free(r); + talloc_free(call); +} + +/** + * called when a CTDB_REPLY_CALL packet comes in + * + * This packet comes in response to a CTDB_REQ_CALL request packet. It + * contains any reply data from the call + */ +void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_reply_call_old *c = (struct ctdb_reply_call_old *)hdr; + struct ctdb_call_state *state; + + state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state); + if (state == NULL) { + DEBUG(DEBUG_ERR, (__location__ " reqid %u not found\n", hdr->reqid)); + return; + } + + if (hdr->reqid != state->reqid) { + /* we found a record but it was the wrong one */ + DEBUG(DEBUG_ERR, ("Dropped orphaned call reply with reqid:%u\n",hdr->reqid)); + return; + } + + + /* read only delegation processing */ + /* If we got a FETCH_WITH_HEADER we should check if this is a ro + * delegation since we may need to update the record header + */ + if (state->c->callid == CTDB_FETCH_WITH_HEADER_FUNC) { + struct ctdb_db_context *ctdb_db = state->ctdb_db; + struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)&c->data[0]; + struct ctdb_ltdb_header oldheader; + TDB_DATA key, data, olddata; + int ret; + + if (!(header->flags & CTDB_REC_RO_HAVE_READONLY)) { + goto finished_ro; + return; + } + + key.dsize = state->c->keylen; + key.dptr = state->c->data; + ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr, + ctdb_call_input_pkt, ctdb, false); + if (ret == -2) { + return; + } + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_call\n")); + return; + } + + ret = ctdb_ltdb_fetch(ctdb_db, key, &oldheader, state, &olddata); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to fetch old record in ctdb_reply_call\n")); + ctdb_ltdb_unlock(ctdb_db, key); + 
goto finished_ro; + } + + if (header->rsn <= oldheader.rsn) { + ctdb_ltdb_unlock(ctdb_db, key); + goto finished_ro; + } + + if (c->datalen < sizeof(struct ctdb_ltdb_header)) { + DEBUG(DEBUG_ERR,(__location__ " Got FETCH_WITH_HEADER reply with too little data: %d bytes\n", c->datalen)); + ctdb_ltdb_unlock(ctdb_db, key); + goto finished_ro; + } + + data.dsize = c->datalen - sizeof(struct ctdb_ltdb_header); + data.dptr = &c->data[sizeof(struct ctdb_ltdb_header)]; + ret = ctdb_ltdb_store(ctdb_db, key, header, data); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to store new record in ctdb_reply_call\n")); + ctdb_ltdb_unlock(ctdb_db, key); + goto finished_ro; + } + + ctdb_ltdb_unlock(ctdb_db, key); + } +finished_ro: + + state->call->reply_data.dptr = c->data; + state->call->reply_data.dsize = c->datalen; + state->call->status = c->status; + + talloc_steal(state, c); + + state->state = CTDB_CALL_DONE; + if (state->async.fn) { + state->async.fn(state); + } +} + + +/** + * called when a CTDB_REPLY_DMASTER packet comes in + * + * This packet comes in from the lmaster in response to a CTDB_REQ_CALL + * request packet. It means that the current dmaster wants to give us + * the dmaster role. 
+ */ +void ctdb_reply_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_reply_dmaster_old *c = (struct ctdb_reply_dmaster_old *)hdr; + struct ctdb_db_context *ctdb_db; + TDB_DATA key, data; + uint32_t record_flags = 0; + size_t len; + int ret; + + ctdb_db = find_ctdb_db(ctdb, c->db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_reply_dmaster\n", c->db_id)); + return; + } + + key.dptr = c->data; + key.dsize = c->keylen; + data.dptr = &c->data[key.dsize]; + data.dsize = c->datalen; + len = offsetof(struct ctdb_reply_dmaster_old, data) + key.dsize + data.dsize + + sizeof(uint32_t); + if (len <= c->hdr.length) { + memcpy(&record_flags, &c->data[c->keylen + c->datalen], + sizeof(record_flags)); + } + + dmaster_defer_setup(ctdb_db, hdr, key); + + ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr, + ctdb_call_input_pkt, ctdb, false); + if (ret == -2) { + return; + } + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_dmaster\n")); + return; + } + + ctdb_become_dmaster(ctdb_db, hdr, key, data, c->rsn, record_flags); +} + + +/* + called when a CTDB_REPLY_ERROR packet comes in +*/ +void ctdb_reply_error(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_reply_error_old *c = (struct ctdb_reply_error_old *)hdr; + struct ctdb_call_state *state; + + state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_call_state); + if (state == NULL) { + DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_error\n", + ctdb->pnn, hdr->reqid)); + return; + } + + if (hdr->reqid != state->reqid) { + /* we found a record but it was the wrong one */ + DEBUG(DEBUG_ERR, ("Dropped orphaned error reply with reqid:%u\n",hdr->reqid)); + return; + } + + talloc_steal(state, c); + + state->state = CTDB_CALL_ERROR; + state->errmsg = (char *)c->msg; + if (state->async.fn) { + state->async.fn(state); + } +} + + +/* + destroy a ctdb_call +*/ +static int ctdb_call_destructor(struct ctdb_call_state 
*state) +{ + DLIST_REMOVE(state->ctdb_db->pending_calls, state); + reqid_remove(state->ctdb_db->ctdb->idr, state->reqid); + return 0; +} + + +/* + called when a ctdb_call needs to be resent after a reconfigure event +*/ +static void ctdb_call_resend(struct ctdb_call_state *state) +{ + struct ctdb_context *ctdb = state->ctdb_db->ctdb; + + state->generation = state->ctdb_db->generation; + + /* use a new reqid, in case the old reply does eventually come in */ + reqid_remove(ctdb->idr, state->reqid); + state->reqid = reqid_new(ctdb->idr, state); + state->c->hdr.reqid = state->reqid; + + /* update the generation count for this request, so its valid with the new vnn_map */ + state->c->hdr.generation = state->generation; + + /* send the packet to ourselves, it will be redirected appropriately */ + state->c->hdr.destnode = ctdb->pnn; + + ctdb_queue_packet(ctdb, &state->c->hdr); + D_INFO("resent ctdb_call for db %s reqid %u generation %u\n", + state->ctdb_db->db_name, + state->reqid, + state->generation); +} + +/* + resend all pending calls on recovery + */ +void ctdb_call_resend_db(struct ctdb_db_context *ctdb_db) +{ + struct ctdb_call_state *state, *next; + unsigned int count = 0; + + for (state = ctdb_db->pending_calls; state; state = next) { + next = state->next; + ctdb_call_resend(state); + count++; + } + /* Avoid logging a 0 count below */ + if (count == 0) { + return; + } + D_NOTICE("Resent calls for database=%s, generation=%u, count=%u\n", + ctdb_db->db_name, + ctdb_db->generation, + count); +} + +void ctdb_call_resend_all(struct ctdb_context *ctdb) +{ + struct ctdb_db_context *ctdb_db; + + for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) { + ctdb_call_resend_db(ctdb_db); + } +} + +/* + this allows the caller to setup a async.fn +*/ +static void call_local_trigger(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_call_state *state = talloc_get_type(private_data, struct ctdb_call_state); + if 
(state->async.fn) { + state->async.fn(state); + } +} + + +/* + construct an event driven local ctdb_call + + this is used so that locally processed ctdb_call requests are processed + in an event driven manner +*/ +struct ctdb_call_state *ctdb_call_local_send(struct ctdb_db_context *ctdb_db, + struct ctdb_call *call, + struct ctdb_ltdb_header *header, + TDB_DATA *data) +{ + struct ctdb_call_state *state; + struct ctdb_context *ctdb = ctdb_db->ctdb; + int ret; + + state = talloc_zero(ctdb_db, struct ctdb_call_state); + CTDB_NO_MEMORY_NULL(ctdb, state); + + talloc_steal(state, data->dptr); + + state->state = CTDB_CALL_DONE; + state->call = talloc(state, struct ctdb_call); + CTDB_NO_MEMORY_NULL(ctdb, state->call); + *(state->call) = *call; + state->ctdb_db = ctdb_db; + + ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true); + if (ret != 0) { + DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret)); + } + + tevent_add_timer(ctdb->ev, state, timeval_zero(), + call_local_trigger, state); + + return state; +} + + +/* + make a remote ctdb call - async send. Called in daemon context. + + This constructs a ctdb_call request and queues it for processing. + This call never blocks. +*/ +struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctdb_db, + struct ctdb_call *call, + struct ctdb_ltdb_header *header) +{ + uint32_t len; + struct ctdb_call_state *state; + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct ctdb_req_call_old *c; + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_INFO,(__location__ " Failed send packet. 
Transport is down\n")); + return NULL; + } + + state = talloc_zero(ctdb_db, struct ctdb_call_state); + CTDB_NO_MEMORY_NULL(ctdb, state); + state->call = talloc(state, struct ctdb_call); + CTDB_NO_MEMORY_NULL(ctdb, state->call); + + state->reqid = reqid_new(ctdb->idr, state); + state->ctdb_db = ctdb_db; + state->state = CTDB_CALL_WAIT; + state->generation = ctdb_db->generation; + + len = offsetof(struct ctdb_req_call_old, data) + call->key.dsize + + call->call_data.dsize; + + c = ctdb_transport_allocate(ctdb, + state, + CTDB_REQ_CALL, + len, + struct ctdb_req_call_old); + + CTDB_NO_MEMORY_NULL(ctdb, c); + state->c = c; + + c->hdr.destnode = header->dmaster; + c->hdr.reqid = state->reqid; + c->hdr.generation = ctdb_db->generation; + c->flags = call->flags; + c->db_id = ctdb_db->db_id; + c->callid = call->call_id; + c->hopcount = 0; + c->keylen = call->key.dsize; + c->calldatalen = call->call_data.dsize; + + memcpy(&c->data[0], call->key.dptr, call->key.dsize); + memcpy(&c->data[call->key.dsize], + call->call_data.dptr, + call->call_data.dsize); + + *(state->call) = *call; + state->call->call_data.dptr = &c->data[call->key.dsize]; + state->call->key.dptr = &c->data[0]; + + DLIST_ADD(ctdb_db->pending_calls, state); + + talloc_set_destructor(state, ctdb_call_destructor); + ctdb_queue_packet(ctdb, &state->c->hdr); + + return state; +} + +/* + make a remote ctdb call - async recv - called in daemon context + + This is called when the program wants to wait for a ctdb_call to complete and get the + results. This call will block unless the call has already completed. 
+*/ +int ctdb_daemon_call_recv(struct ctdb_call_state *state, struct ctdb_call *call) +{ + while (state->state < CTDB_CALL_DONE) { + tevent_loop_once(state->ctdb_db->ctdb->ev); + } + if (state->state != CTDB_CALL_DONE) { + ctdb_set_error(state->ctdb_db->ctdb, "%s", state->errmsg); + talloc_free(state); + return -1; + } + + if (state->call->reply_data.dsize) { + call->reply_data.dptr = talloc_memdup(call, + state->call->reply_data.dptr, + state->call->reply_data.dsize); + call->reply_data.dsize = state->call->reply_data.dsize; + } else { + call->reply_data.dptr = NULL; + call->reply_data.dsize = 0; + } + call->status = state->call->status; + talloc_free(state); + return 0; +} + + +struct revokechild_deferred_call { + struct revokechild_deferred_call *prev, *next; + struct ctdb_context *ctdb; + struct ctdb_req_header *hdr; + deferred_requeue_fn fn; + void *ctx; + struct revokechild_handle *rev_hdl; +}; + +struct revokechild_handle { + struct revokechild_handle *next, *prev; + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + struct tevent_fd *fde; + int status; + int fd[2]; + pid_t child; + TDB_DATA key; + struct revokechild_deferred_call *deferred_call_list; +}; + +static void deferred_call_requeue(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct revokechild_deferred_call *dlist = talloc_get_type_abort( + private_data, struct revokechild_deferred_call); + + while (dlist != NULL) { + struct revokechild_deferred_call *dcall = dlist; + + talloc_set_destructor(dcall, NULL); + DLIST_REMOVE(dlist, dcall); + dcall->fn(dcall->ctx, dcall->hdr); + talloc_free(dcall); + } +} + +static int deferred_call_destructor(struct revokechild_deferred_call *dcall) +{ + struct revokechild_handle *rev_hdl = dcall->rev_hdl; + + DLIST_REMOVE(rev_hdl->deferred_call_list, dcall); + return 0; +} + +static int revokechild_destructor(struct revokechild_handle *rev_hdl) +{ + struct revokechild_deferred_call *now_list = NULL; + 
struct revokechild_deferred_call *delay_list = NULL; + + if (rev_hdl->fde != NULL) { + talloc_free(rev_hdl->fde); + } + + if (rev_hdl->fd[0] != -1) { + close(rev_hdl->fd[0]); + } + if (rev_hdl->fd[1] != -1) { + close(rev_hdl->fd[1]); + } + ctdb_kill(rev_hdl->ctdb, rev_hdl->child, SIGKILL); + + DLIST_REMOVE(rev_hdl->ctdb_db->revokechild_active, rev_hdl); + + while (rev_hdl->deferred_call_list != NULL) { + struct revokechild_deferred_call *dcall; + + dcall = rev_hdl->deferred_call_list; + DLIST_REMOVE(rev_hdl->deferred_call_list, dcall); + + /* If revoke is successful, then first process all the calls + * that need write access, and delay readonly requests by 1 + * second grace. + * + * If revoke is unsuccessful, most likely because of node + * failure, delay all the pending requests, so database can + * be recovered. + */ + + if (rev_hdl->status == 0) { + struct ctdb_req_call_old *c; + + c = (struct ctdb_req_call_old *)dcall->hdr; + if (c->flags & CTDB_WANT_READONLY) { + DLIST_ADD(delay_list, dcall); + } else { + DLIST_ADD(now_list, dcall); + } + } else { + DLIST_ADD(delay_list, dcall); + } + } + + if (now_list != NULL) { + tevent_add_timer(rev_hdl->ctdb->ev, + rev_hdl->ctdb_db, + tevent_timeval_current_ofs(0, 0), + deferred_call_requeue, + now_list); + } + + if (delay_list != NULL) { + tevent_add_timer(rev_hdl->ctdb->ev, + rev_hdl->ctdb_db, + tevent_timeval_current_ofs(1, 0), + deferred_call_requeue, + delay_list); + } + + return 0; +} + +static void revokechild_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + struct revokechild_handle *rev_hdl = + talloc_get_type(private_data, struct revokechild_handle); + int ret; + char c; + + ret = sys_read(rev_hdl->fd[0], &c, 1); + if (ret != 1) { + DEBUG(DEBUG_ERR,("Failed to read status from revokechild. errno:%d\n", errno)); + rev_hdl->status = -1; + talloc_free(rev_hdl); + return; + } + if (c != 0) { + DEBUG(DEBUG_ERR,("revokechild returned failure. 
status:%d\n", c)); + rev_hdl->status = -1; + talloc_free(rev_hdl); + return; + } + + talloc_free(rev_hdl); +} + +struct ctdb_revoke_state { + struct ctdb_db_context *ctdb_db; + TDB_DATA key; + struct ctdb_ltdb_header *header; + TDB_DATA data; + int count; + int status; + int finished; +}; + +static void update_record_cb(struct ctdb_client_control_state *state) +{ + struct ctdb_revoke_state *revoke_state; + int ret; + int32_t res; + + if (state == NULL) { + return; + } + revoke_state = state->async.private_data; + + state->async.fn = NULL; + ret = ctdb_control_recv(state->ctdb, state, state, NULL, &res, NULL); + if ((ret != 0) || (res != 0)) { + DEBUG(DEBUG_ERR,("Recv for revoke update record failed ret:%d res:%d\n", ret, res)); + revoke_state->status = -1; + } + + revoke_state->count--; + if (revoke_state->count <= 0) { + revoke_state->finished = 1; + } +} + +static void revoke_send_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data) +{ + struct ctdb_revoke_state *revoke_state = private_data; + struct ctdb_client_control_state *state; + + state = ctdb_ctrl_updaterecord_send(ctdb, revoke_state, timeval_current_ofs(ctdb->tunable.control_timeout,0), pnn, revoke_state->ctdb_db, revoke_state->key, revoke_state->header, revoke_state->data); + if (state == NULL) { + DEBUG(DEBUG_ERR,("Failure to send update record to revoke readonly delegation\n")); + revoke_state->status = -1; + return; + } + state->async.fn = update_record_cb; + state->async.private_data = revoke_state; + + revoke_state->count++; + +} + +static void ctdb_revoke_timeout_handler(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval yt, void *private_data) +{ + struct ctdb_revoke_state *state = private_data; + + DEBUG(DEBUG_ERR,("Timed out waiting for revoke to finish\n")); + state->finished = 1; + state->status = -1; +} + +static int ctdb_revoke_all_delegations(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA tdata, TDB_DATA key, struct ctdb_ltdb_header 
*header, TDB_DATA data) +{ + struct ctdb_revoke_state *state = talloc_zero(ctdb, struct ctdb_revoke_state); + struct ctdb_ltdb_header new_header; + TDB_DATA new_data; + + state->ctdb_db = ctdb_db; + state->key = key; + state->header = header; + state->data = data; + + ctdb_trackingdb_traverse(ctdb, tdata, revoke_send_cb, state); + + tevent_add_timer(ctdb->ev, state, + timeval_current_ofs(ctdb->tunable.control_timeout, 0), + ctdb_revoke_timeout_handler, state); + + while (state->finished == 0) { + tevent_loop_once(ctdb->ev); + } + + if (ctdb_ltdb_lock(ctdb_db, key) != 0) { + DEBUG(DEBUG_ERR,("Failed to chainlock the database in revokechild\n")); + talloc_free(state); + return -1; + } + if (ctdb_ltdb_fetch(ctdb_db, key, &new_header, state, &new_data) != 0) { + ctdb_ltdb_unlock(ctdb_db, key); + DEBUG(DEBUG_ERR,("Failed for fetch tdb record in revokechild\n")); + talloc_free(state); + return -1; + } + header->rsn++; + if (new_header.rsn > header->rsn) { + ctdb_ltdb_unlock(ctdb_db, key); + DEBUG(DEBUG_ERR,("RSN too high in tdb record in revokechild\n")); + talloc_free(state); + return -1; + } + if ( (new_header.flags & (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS)) != (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS) ) { + ctdb_ltdb_unlock(ctdb_db, key); + DEBUG(DEBUG_ERR,("Flags are wrong in tdb record in revokechild\n")); + talloc_free(state); + return -1; + } + + /* + * If revoke on all nodes succeed, revoke is complete. Otherwise, + * remove CTDB_REC_RO_REVOKING_READONLY flag and retry. 
+ */ + if (state->status == 0) { + new_header.rsn++; + new_header.flags |= CTDB_REC_RO_REVOKE_COMPLETE; + } else { + DEBUG(DEBUG_NOTICE, ("Revoke all delegations failed, retrying.\n")); + new_header.flags &= ~CTDB_REC_RO_REVOKING_READONLY; + } + if (ctdb_ltdb_store(ctdb_db, key, &new_header, new_data) != 0) { + ctdb_ltdb_unlock(ctdb_db, key); + DEBUG(DEBUG_ERR,("Failed to write new record in revokechild\n")); + talloc_free(state); + return -1; + } + ctdb_ltdb_unlock(ctdb_db, key); + + talloc_free(state); + return 0; +} + + +int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb, + struct ctdb_db_context *ctdb_db, + TDB_DATA key, + struct ctdb_ltdb_header *header, + TDB_DATA data) +{ + TDB_DATA tdata; + struct revokechild_handle *rev_hdl; + pid_t parent = getpid(); + int ret; + + header->flags &= ~(CTDB_REC_RO_REVOKING_READONLY | + CTDB_REC_RO_HAVE_DELEGATIONS | + CTDB_REC_RO_HAVE_READONLY); + + header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA; + header->rsn -= 1; + + rev_hdl = talloc_zero(ctdb_db, struct revokechild_handle); + if (rev_hdl == NULL) { + D_ERR("Failed to allocate revokechild_handle\n"); + return -1; + } + + tdata = tdb_fetch(ctdb_db->rottdb, key); + if (tdata.dsize > 0) { + uint8_t *tmp; + + tmp = tdata.dptr; + tdata.dptr = talloc_memdup(rev_hdl, tdata.dptr, tdata.dsize); + free(tmp); + } + + rev_hdl->status = 0; + rev_hdl->ctdb = ctdb; + rev_hdl->ctdb_db = ctdb_db; + rev_hdl->fd[0] = -1; + rev_hdl->fd[1] = -1; + + rev_hdl->key.dsize = key.dsize; + rev_hdl->key.dptr = talloc_memdup(rev_hdl, key.dptr, key.dsize); + if (rev_hdl->key.dptr == NULL) { + D_ERR("Failed to allocate key for revokechild_handle\n"); + goto err_out; + } + + ret = pipe(rev_hdl->fd); + if (ret != 0) { + D_ERR("Failed to allocate key for revokechild_handle\n"); + goto err_out; + } + + + rev_hdl->child = ctdb_fork(ctdb); + if (rev_hdl->child == (pid_t)-1) { + D_ERR("Failed to fork child for revokechild\n"); + goto err_out; + } + + if (rev_hdl->child == 0) { + char c = 0; + 
close(rev_hdl->fd[0]); + + prctl_set_comment("ctdb_revokechild"); + if (switch_from_server_to_client(ctdb) != 0) { + D_ERR("Failed to switch from server to client " + "for revokechild process\n"); + c = 1; + goto child_finished; + } + + c = ctdb_revoke_all_delegations(ctdb, + ctdb_db, + tdata, + key, + header, + data); + +child_finished: + sys_write(rev_hdl->fd[1], &c, 1); + ctdb_wait_for_process_to_exit(parent); + _exit(0); + } + + close(rev_hdl->fd[1]); + rev_hdl->fd[1] = -1; + set_close_on_exec(rev_hdl->fd[0]); + + rev_hdl->fde = tevent_add_fd(ctdb->ev, + rev_hdl, + rev_hdl->fd[0], + TEVENT_FD_READ, + revokechild_handler, + (void *)rev_hdl); + + if (rev_hdl->fde == NULL) { + D_ERR("Failed to set up fd event for revokechild process\n"); + talloc_free(rev_hdl); + } + tevent_fd_set_auto_close(rev_hdl->fde); + + /* This is an active revokechild child process */ + DLIST_ADD_END(ctdb_db->revokechild_active, rev_hdl); + talloc_set_destructor(rev_hdl, revokechild_destructor); + + return 0; +err_out: + talloc_free(rev_hdl); + return -1; +} + +int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context) +{ + struct revokechild_handle *rev_hdl; + struct revokechild_deferred_call *deferred_call; + + for (rev_hdl = ctdb_db->revokechild_active; + rev_hdl; + rev_hdl = rev_hdl->next) { + if (rev_hdl->key.dsize == 0) { + continue; + } + if (rev_hdl->key.dsize != key.dsize) { + continue; + } + if (!memcmp(rev_hdl->key.dptr, key.dptr, key.dsize)) { + break; + } + } + + if (rev_hdl == NULL) { + DEBUG(DEBUG_ERR,("Failed to add deferred call to revoke list. 
revoke structure not found\n")); + return -1; + } + + deferred_call = talloc(call_context, struct revokechild_deferred_call); + if (deferred_call == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate deferred call structure for revoking record\n")); + return -1; + } + + deferred_call->ctdb = ctdb; + deferred_call->hdr = talloc_steal(deferred_call, hdr); + deferred_call->fn = fn; + deferred_call->ctx = call_context; + deferred_call->rev_hdl = rev_hdl; + + talloc_set_destructor(deferred_call, deferred_call_destructor); + + DLIST_ADD(rev_hdl->deferred_call_list, deferred_call); + + return 0; +} + +static void ctdb_migration_count_handler(TDB_DATA key, uint64_t counter, + void *private_data) +{ + struct ctdb_db_context *ctdb_db = talloc_get_type_abort( + private_data, struct ctdb_db_context); + unsigned int value; + + value = (counter < INT_MAX ? counter : INT_MAX); + ctdb_update_db_stat_hot_keys(ctdb_db, key, value); +} + +static void ctdb_migration_cleandb_event(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval current_time, + void *private_data) +{ + struct ctdb_db_context *ctdb_db = talloc_get_type_abort( + private_data, struct ctdb_db_context); + + if (ctdb_db->migratedb == NULL) { + return; + } + + hash_count_expire(ctdb_db->migratedb, NULL); + + te = tevent_add_timer(ctdb_db->ctdb->ev, ctdb_db->migratedb, + tevent_timeval_current_ofs(10, 0), + ctdb_migration_cleandb_event, ctdb_db); + if (te == NULL) { + DEBUG(DEBUG_ERR, + ("Memory error in migration cleandb event for %s\n", + ctdb_db->db_name)); + TALLOC_FREE(ctdb_db->migratedb); + } +} + +int ctdb_migration_init(struct ctdb_db_context *ctdb_db) +{ + struct timeval one_second = { 1, 0 }; + struct tevent_timer *te; + int ret; + + if (! 
ctdb_db_volatile(ctdb_db)) { + return 0; + } + + ret = hash_count_init(ctdb_db, one_second, + ctdb_migration_count_handler, ctdb_db, + &ctdb_db->migratedb); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Memory error in migration init for %s\n", + ctdb_db->db_name)); + return -1; + } + + te = tevent_add_timer(ctdb_db->ctdb->ev, ctdb_db->migratedb, + tevent_timeval_current_ofs(10, 0), + ctdb_migration_cleandb_event, ctdb_db); + if (te == NULL) { + DEBUG(DEBUG_ERR, + ("Memory error in migration init for %s\n", + ctdb_db->db_name)); + TALLOC_FREE(ctdb_db->migratedb); + return -1; + } + + return 0; +} diff --git a/ctdb/server/ctdb_client.c b/ctdb/server/ctdb_client.c new file mode 100644 index 0000000..c9edb1d --- /dev/null +++ b/ctdb/server/ctdb_client.c @@ -0,0 +1,1709 @@ +/* + ctdb daemon code + + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Ronnie Sahlberg 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/locale.h" + +#include <talloc.h> +#include <tevent.h> +#include <tdb.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/time.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/reqid.h" +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" + +/* + allocate a packet for use in client<->daemon communication + */ +struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + enum ctdb_operation operation, + size_t length, size_t slength, + const char *type) +{ + int size; + struct ctdb_req_header *hdr; + + length = MAX(length, slength); + size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1); + + hdr = (struct ctdb_req_header *)talloc_zero_size(mem_ctx, size); + if (hdr == NULL) { + DEBUG(DEBUG_ERR,("Unable to allocate packet for operation %u of length %u\n", + operation, (unsigned)length)); + return NULL; + } + talloc_set_name_const(hdr, type); + hdr->length = length; + hdr->operation = operation; + hdr->ctdb_magic = CTDB_MAGIC; + hdr->ctdb_version = CTDB_PROTOCOL; + hdr->srcnode = ctdb->pnn; + if (ctdb->vnn_map) { + hdr->generation = ctdb->vnn_map->generation; + } + + return hdr; +} + +/* + local version of ctdb_call +*/ +int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call, + struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx, + TDB_DATA *data, bool updatetdb) +{ + struct ctdb_call_info *c; + struct ctdb_registered_call *fn; + struct ctdb_context *ctdb = ctdb_db->ctdb; + + c = talloc_zero(mem_ctx, struct ctdb_call_info); + CTDB_NO_MEMORY(ctdb, c); + + c->key = call->key; + c->call_data = &call->call_data; + c->record_data.dptr = talloc_memdup(c, data->dptr, data->dsize); + c->record_data.dsize = data->dsize; + CTDB_NO_MEMORY(ctdb, 
c->record_data.dptr); + c->header = header; + + for (fn=ctdb_db->calls;fn;fn=fn->next) { + if (fn->id == (uint32_t)call->call_id) { + break; + } + } + if (fn == NULL) { + ctdb_set_error(ctdb, "Unknown call id %u\n", call->call_id); + talloc_free(c); + return -1; + } + + if (fn->fn(c) != 0) { + ctdb_set_error(ctdb, "ctdb_call %u failed\n", call->call_id); + talloc_free(c); + return -1; + } + + /* we need to force the record to be written out if this was a remote access */ + if (c->new_data == NULL) { + c->new_data = &c->record_data; + } + + if (c->new_data && updatetdb) { + /* XXX check that we always have the lock here? */ + if (ctdb_ltdb_store(ctdb_db, call->key, header, *c->new_data) != 0) { + ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n"); + talloc_free(c); + return -1; + } + } + + if (c->reply_data) { + call->reply_data = *c->reply_data; + + talloc_steal(call, call->reply_data.dptr); + talloc_set_name_const(call->reply_data.dptr, __location__); + } else { + call->reply_data.dptr = NULL; + call->reply_data.dsize = 0; + } + call->status = c->status; + + talloc_free(c); + + return 0; +} + + +/* + queue a packet for sending from client to daemon +*/ +static int ctdb_client_queue_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + return ctdb_queue_send(ctdb->daemon.queue, (uint8_t *)hdr, hdr->length); +} + + +/* + called when a CTDB_REPLY_CALL packet comes in in the client + + This packet comes in response to a CTDB_REQ_CALL request packet. 
It + contains any reply data from the call +*/ +static void ctdb_client_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_reply_call_old *c = (struct ctdb_reply_call_old *)hdr; + struct ctdb_client_call_state *state; + + state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_client_call_state); + if (state == NULL) { + DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid)); + return; + } + + if (hdr->reqid != state->reqid) { + /* we found a record but it was the wrong one */ + DEBUG(DEBUG_ERR, ("Dropped client call reply with reqid:%u\n",hdr->reqid)); + return; + } + + state->call->reply_data.dptr = c->data; + state->call->reply_data.dsize = c->datalen; + state->call->status = c->status; + + talloc_steal(state, c); + + state->state = CTDB_CALL_DONE; + + if (state->async.fn) { + state->async.fn(state); + } +} + +void ctdb_request_message(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr) +{ + struct ctdb_req_message_old *c = (struct ctdb_req_message_old *)hdr; + TDB_DATA data; + + data.dsize = c->datalen; + data.dptr = talloc_memdup(c, &c->data[0], c->datalen); + if (data.dptr == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Memory allocation failure\n")); + return; + } + + srvid_dispatch(ctdb->srv, c->srvid, CTDB_SRVID_ALL, data); +} + +static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr); + +/* + this is called in the client, when data comes in from the daemon + */ +void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args) +{ + struct ctdb_context *ctdb = talloc_get_type(args, struct ctdb_context); + struct ctdb_req_header *hdr = (struct ctdb_req_header *)data; + TALLOC_CTX *tmp_ctx; + + /* place the packet as a child of a tmp_ctx. We then use + talloc_free() below to free it. 
If any of the calls want + to keep it, then they will steal it somewhere else, and the + talloc_free() will be a no-op */ + tmp_ctx = talloc_new(ctdb); + talloc_steal(tmp_ctx, hdr); + + if (cnt == 0) { + DEBUG(DEBUG_CRIT,("Daemon has exited - shutting down client\n")); + exit(1); + } + + if (cnt < sizeof(*hdr)) { + DEBUG(DEBUG_CRIT,("Bad packet length %u in client\n", (unsigned)cnt)); + goto done; + } + if (cnt != hdr->length) { + ctdb_set_error(ctdb, "Bad header length %u expected %u in client\n", + (unsigned)hdr->length, (unsigned)cnt); + goto done; + } + + if (hdr->ctdb_magic != CTDB_MAGIC) { + ctdb_set_error(ctdb, "Non CTDB packet rejected in client\n"); + goto done; + } + + if (hdr->ctdb_version != CTDB_PROTOCOL) { + ctdb_set_error(ctdb, "Bad CTDB version 0x%x rejected in client\n", hdr->ctdb_version); + goto done; + } + + switch (hdr->operation) { + case CTDB_REPLY_CALL: + ctdb_client_reply_call(ctdb, hdr); + break; + + case CTDB_REQ_MESSAGE: + ctdb_request_message(ctdb, hdr); + break; + + case CTDB_REPLY_CONTROL: + ctdb_client_reply_control(ctdb, hdr); + break; + + default: + DEBUG(DEBUG_CRIT,("bogus operation code:%u\n",hdr->operation)); + } + +done: + talloc_free(tmp_ctx); +} + +/* + connect to a unix domain socket +*/ +int ctdb_socket_connect(struct ctdb_context *ctdb) +{ + struct sockaddr_un addr; + int ret; + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1); + + ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0); + if (ctdb->daemon.sd == -1) { + DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. 
Errno:%s(%d)\n", strerror(errno), errno)); + return -1; + } + + if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + DEBUG(DEBUG_ERR, + (__location__ + "Failed to connect client socket to daemon (%s)\n", + strerror(errno))); + close(ctdb->daemon.sd); + ctdb->daemon.sd = -1; + return -1; + } + + ret = set_blocking(ctdb->daemon.sd, false); + if (ret != 0) { + DEBUG(DEBUG_ERR, + (__location__ + " failed to set socket non-blocking (%s)\n", + strerror(errno))); + close(ctdb->daemon.sd); + ctdb->daemon.sd = -1; + return -1; + } + + set_close_on_exec(ctdb->daemon.sd); + + ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd, + CTDB_DS_ALIGNMENT, + ctdb_client_read_cb, ctdb, "to-ctdbd"); + return 0; +} + + +struct ctdb_record_handle { + struct ctdb_db_context *ctdb_db; + TDB_DATA key; + TDB_DATA *data; + struct ctdb_ltdb_header header; +}; + + +/* + make a recv call to the local ctdb daemon - called from client context + + This is called when the program wants to wait for a ctdb_call to complete and get the + results. This call will block unless the call has already completed. 
+*/ +int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call) +{ + if (state == NULL) { + return -1; + } + + while (state->state < CTDB_CALL_DONE) { + tevent_loop_once(state->ctdb_db->ctdb->ev); + } + if (state->state != CTDB_CALL_DONE) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_call_recv failed\n")); + talloc_free(state); + return -1; + } + + if (state->call->reply_data.dsize) { + call->reply_data.dptr = talloc_memdup(state->ctdb_db, + state->call->reply_data.dptr, + state->call->reply_data.dsize); + call->reply_data.dsize = state->call->reply_data.dsize; + } else { + call->reply_data.dptr = NULL; + call->reply_data.dsize = 0; + } + call->status = state->call->status; + talloc_free(state); + + return call->status; +} + + + + +/* + destroy a ctdb_call in client +*/ +static int ctdb_client_call_destructor(struct ctdb_client_call_state *state) +{ + reqid_remove(state->ctdb_db->ctdb->idr, state->reqid); + return 0; +} + +/* + construct an event driven local ctdb_call + + this is used so that locally processed ctdb_call requests are processed + in an event driven manner +*/ +static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db_context *ctdb_db, + struct ctdb_call *call, + struct ctdb_ltdb_header *header, + TDB_DATA *data) +{ + struct ctdb_client_call_state *state; + struct ctdb_context *ctdb = ctdb_db->ctdb; + int ret; + + state = talloc_zero(ctdb_db, struct ctdb_client_call_state); + CTDB_NO_MEMORY_NULL(ctdb, state); + state->call = talloc_zero(state, struct ctdb_call); + CTDB_NO_MEMORY_NULL(ctdb, state->call); + + talloc_steal(state, data->dptr); + + state->state = CTDB_CALL_DONE; + *(state->call) = *call; + state->ctdb_db = ctdb_db; + + ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true); + if (ret != 0) { + DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret)); + } + + return state; +} + +/* + make a ctdb call to the local daemon - async send. Called from client context. 
+ + This constructs a ctdb_call request and queues it for processing. + This call never blocks. +*/ +struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, + struct ctdb_call *call) +{ + struct ctdb_client_call_state *state; + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct ctdb_ltdb_header header; + TDB_DATA data; + int ret; + size_t len; + struct ctdb_req_call_old *c; + + /* if the domain socket is not yet open, open it */ + if (ctdb->daemon.sd==-1) { + ctdb_socket_connect(ctdb); + } + + ret = ctdb_ltdb_lock(ctdb_db, call->key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to get chainlock\n")); + return NULL; + } + + ret = ctdb_ltdb_fetch(ctdb_db, call->key, &header, ctdb_db, &data); + + if ((call->flags & CTDB_IMMEDIATE_MIGRATION) && (header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) { + ret = -1; + } + + if (ret == 0 && header.dmaster == ctdb->pnn) { + state = ctdb_client_call_local_send(ctdb_db, call, &header, &data); + talloc_free(data.dptr); + ctdb_ltdb_unlock(ctdb_db, call->key); + return state; + } + + ctdb_ltdb_unlock(ctdb_db, call->key); + talloc_free(data.dptr); + + state = talloc_zero(ctdb_db, struct ctdb_client_call_state); + if (state == NULL) { + DEBUG(DEBUG_ERR, (__location__ " failed to allocate state\n")); + return NULL; + } + state->call = talloc_zero(state, struct ctdb_call); + if (state->call == NULL) { + DEBUG(DEBUG_ERR, (__location__ " failed to allocate state->call\n")); + return NULL; + } + + len = offsetof(struct ctdb_req_call_old, data) + call->key.dsize + call->call_data.dsize; + c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CALL, len, struct ctdb_req_call_old); + if (c == NULL) { + DEBUG(DEBUG_ERR, (__location__ " failed to allocate packet\n")); + return NULL; + } + + state->reqid = reqid_new(ctdb->idr, state); + state->ctdb_db = ctdb_db; + talloc_set_destructor(state, ctdb_client_call_destructor); + + c->hdr.reqid = state->reqid; + c->flags = call->flags; + c->db_id = ctdb_db->db_id; + c->callid = 
call->call_id; + c->hopcount = 0; + c->keylen = call->key.dsize; + c->calldatalen = call->call_data.dsize; + memcpy(&c->data[0], call->key.dptr, call->key.dsize); + memcpy(&c->data[call->key.dsize], + call->call_data.dptr, call->call_data.dsize); + *(state->call) = *call; + state->call->call_data.dptr = &c->data[call->key.dsize]; + state->call->key.dptr = &c->data[0]; + + state->state = CTDB_CALL_WAIT; + + + ctdb_client_queue_pkt(ctdb, &c->hdr); + + return state; +} + + +/* + full ctdb_call. Equivalent to a ctdb_call_send() followed by a ctdb_call_recv() +*/ +int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call) +{ + struct ctdb_client_call_state *state; + + state = ctdb_call_send(ctdb_db, call); + return ctdb_call_recv(state, call); +} + + +/* + tell the daemon what messaging srvid we will use, and register the message + handler function in the client +*/ +int ctdb_client_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid, + srvid_handler_fn handler, + void *private_data) +{ + int res; + int32_t status; + + res = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid, + CTDB_CONTROL_REGISTER_SRVID, 0, + tdb_null, NULL, NULL, &status, NULL, NULL); + if (res != 0 || status != 0) { + DEBUG(DEBUG_ERR, + ("Failed to register srvid %llu\n", + (unsigned long long)srvid)); + return -1; + } + + /* also need to register the handler with our own ctdb structure */ + return srvid_register(ctdb->srv, ctdb, srvid, handler, private_data); +} + +/* + tell the daemon we no longer want a srvid +*/ +int ctdb_client_remove_message_handler(struct ctdb_context *ctdb, + uint64_t srvid, void *private_data) +{ + int res; + int32_t status; + + res = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid, + CTDB_CONTROL_DEREGISTER_SRVID, 0, + tdb_null, NULL, NULL, &status, NULL, NULL); + if (res != 0 || status != 0) { + DEBUG(DEBUG_ERR, + ("Failed to deregister srvid %llu\n", + (unsigned long long)srvid)); + return -1; + } + + /* also need to register the handler with our own ctdb 
structure */ + srvid_deregister(ctdb->srv, srvid, private_data); + return 0; +} + +/* + send a message - from client context + */ +int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t pnn, + uint64_t srvid, TDB_DATA data) +{ + struct ctdb_req_message_old *r; + int len, res; + + len = offsetof(struct ctdb_req_message_old, data) + data.dsize; + r = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_MESSAGE, + len, struct ctdb_req_message_old); + CTDB_NO_MEMORY(ctdb, r); + + r->hdr.destnode = pnn; + r->srvid = srvid; + r->datalen = data.dsize; + memcpy(&r->data[0], data.dptr, data.dsize); + + res = ctdb_client_queue_pkt(ctdb, &r->hdr); + talloc_free(r); + return res; +} + + +/* + called when a control completes or timesout to invoke the callback + function the user provided +*/ +static void invoke_control_callback(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_client_control_state *state; + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + int ret; + + state = talloc_get_type(private_data, struct ctdb_client_control_state); + talloc_steal(tmp_ctx, state); + + ret = ctdb_control_recv(state->ctdb, state, state, + NULL, + NULL, + NULL); + if (ret != 0) { + DEBUG(DEBUG_DEBUG,("ctdb_control_recv() failed, ignoring return code %d\n", ret)); + } + + talloc_free(tmp_ctx); +} + +/* + called when a CTDB_REPLY_CONTROL packet comes in in the client + + This packet comes in response to a CTDB_REQ_CONTROL request packet. 
It + contains any reply data from the control +*/ +static void ctdb_client_reply_control(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr) +{ + struct ctdb_reply_control_old *c = (struct ctdb_reply_control_old *)hdr; + struct ctdb_client_control_state *state; + + state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_client_control_state); + if (state == NULL) { + DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid)); + return; + } + + if (hdr->reqid != state->reqid) { + /* we found a record but it was the wrong one */ + DEBUG(DEBUG_ERR, ("Dropped orphaned reply control with reqid:%u\n",hdr->reqid)); + return; + } + + state->outdata.dptr = c->data; + state->outdata.dsize = c->datalen; + state->status = c->status; + if (c->errorlen) { + state->errormsg = talloc_strndup(state, + (char *)&c->data[c->datalen], + c->errorlen); + } + + /* state->outdata now uses resources from c so we don't want c + to just disappear from under us while state is still alive + */ + talloc_steal(state, c); + + state->state = CTDB_CONTROL_DONE; + + /* if we had a callback registered for this control, pull the response + and call the callback. + */ + if (state->async.fn) { + tevent_add_timer(ctdb->ev, state, timeval_zero(), + invoke_control_callback, state); + } +} + + +/* + destroy a ctdb_control in client +*/ +static int ctdb_client_control_destructor(struct ctdb_client_control_state *state) +{ + reqid_remove(state->ctdb->idr, state->reqid); + return 0; +} + + +/* time out handler for ctdb_control */ +static void control_timeout_func(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_client_control_state *state = talloc_get_type(private_data, struct ctdb_client_control_state); + + DEBUG(DEBUG_ERR,(__location__ " control timed out. 
reqid:%u opcode:%u " + "dstnode:%u\n", state->reqid, state->c->opcode, + state->c->hdr.destnode)); + + state->state = CTDB_CONTROL_TIMEOUT; + + /* if we had a callback registered for this control, pull the response + and call the callback. + */ + if (state->async.fn) { + tevent_add_timer(state->ctdb->ev, state, timeval_zero(), + invoke_control_callback, state); + } +} + +/* async version of send control request */ +struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb, + uint32_t destnode, uint64_t srvid, + uint32_t opcode, uint32_t flags, TDB_DATA data, + TALLOC_CTX *mem_ctx, + struct timeval *timeout, + char **errormsg) +{ + struct ctdb_client_control_state *state; + size_t len; + struct ctdb_req_control_old *c; + int ret; + + if (errormsg) { + *errormsg = NULL; + } + + /* if the domain socket is not yet open, open it */ + if (ctdb->daemon.sd==-1) { + ctdb_socket_connect(ctdb); + } + + state = talloc_zero(mem_ctx, struct ctdb_client_control_state); + CTDB_NO_MEMORY_NULL(ctdb, state); + + state->ctdb = ctdb; + state->reqid = reqid_new(ctdb->idr, state); + state->state = CTDB_CONTROL_WAIT; + state->errormsg = NULL; + + talloc_set_destructor(state, ctdb_client_control_destructor); + + len = offsetof(struct ctdb_req_control_old, data) + data.dsize; + c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CONTROL, + len, struct ctdb_req_control_old); + state->c = c; + CTDB_NO_MEMORY_NULL(ctdb, c); + c->hdr.reqid = state->reqid; + c->hdr.destnode = destnode; + c->opcode = opcode; + c->client_id = 0; + c->flags = flags; + c->srvid = srvid; + c->datalen = data.dsize; + if (data.dsize) { + memcpy(&c->data[0], data.dptr, data.dsize); + } + + /* timeout */ + if (timeout && !timeval_is_zero(timeout)) { + tevent_add_timer(ctdb->ev, state, *timeout, + control_timeout_func, state); + } + + ret = ctdb_client_queue_pkt(ctdb, &(c->hdr)); + if (ret != 0) { + talloc_free(state); + return NULL; + } + + if (flags & CTDB_CTRL_FLAG_NOREPLY) { + talloc_free(state); + return 
NULL; + } + + return state; +} + + +/* async version of receive control reply */ +int ctdb_control_recv(struct ctdb_context *ctdb, + struct ctdb_client_control_state *state, + TALLOC_CTX *mem_ctx, + TDB_DATA *outdata, int32_t *status, char **errormsg) +{ + TALLOC_CTX *tmp_ctx; + + if (status != NULL) { + *status = -1; + } + if (errormsg != NULL) { + *errormsg = NULL; + } + + if (state == NULL) { + return -1; + } + + /* prevent double free of state */ + tmp_ctx = talloc_new(ctdb); + talloc_steal(tmp_ctx, state); + + /* loop one event at a time until we either timeout or the control + completes. + */ + while (state->state == CTDB_CONTROL_WAIT) { + tevent_loop_once(ctdb->ev); + } + + if (state->state != CTDB_CONTROL_DONE) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control_recv failed\n")); + if (state->async.fn) { + state->async.fn(state); + } + talloc_free(tmp_ctx); + return -1; + } + + if (state->errormsg) { + int s = (state->status == 0 ? -1 : state->status); + DEBUG(DEBUG_ERR,("ctdb_control error: '%s'\n", state->errormsg)); + if (errormsg) { + (*errormsg) = talloc_move(mem_ctx, &state->errormsg); + } + if (state->async.fn) { + state->async.fn(state); + } + talloc_free(tmp_ctx); + return s; + } + + if (outdata) { + *outdata = state->outdata; + outdata->dptr = talloc_memdup(mem_ctx, outdata->dptr, outdata->dsize); + } + + if (status) { + *status = state->status; + } + + if (state->async.fn) { + state->async.fn(state); + } + + talloc_free(tmp_ctx); + return 0; +} + + + +/* + send a ctdb control message + timeout specifies how long we should wait for a reply. 
+ if timeout is NULL we wait indefinitely + */ +int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid, + uint32_t opcode, uint32_t flags, TDB_DATA data, + TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status, + struct timeval *timeout, + char **errormsg) +{ + struct ctdb_client_control_state *state; + + state = ctdb_control_send(ctdb, destnode, srvid, opcode, + flags, data, mem_ctx, + timeout, errormsg); + + /* FIXME: Error conditions in ctdb_control_send return NULL without + * setting errormsg. So, there is no way to distinguish between success + * and failure when CTDB_CTRL_FLAG_NOREPLY is set */ + if (flags & CTDB_CTRL_FLAG_NOREPLY) { + if (status != NULL) { + *status = 0; + } + return 0; + } + + return ctdb_control_recv(ctdb, state, mem_ctx, outdata, status, + errormsg); +} + +/* + get vnn map from a remote node + */ +int ctdb_ctrl_getvnnmap(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_vnn_map **vnnmap) +{ + int ret; + TDB_DATA outdata; + int32_t res; + struct ctdb_vnn_map_wire *map; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_GETVNNMAP, 0, tdb_null, + mem_ctx, &outdata, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getvnnmap failed\n")); + return -1; + } + + map = (struct ctdb_vnn_map_wire *)outdata.dptr; + if (outdata.dsize < offsetof(struct ctdb_vnn_map_wire, map) || + outdata.dsize != map->size*sizeof(uint32_t) + offsetof(struct ctdb_vnn_map_wire, map)) { + DEBUG(DEBUG_ERR,("Bad vnn map size received in ctdb_ctrl_getvnnmap\n")); + return -1; + } + + (*vnnmap) = talloc(mem_ctx, struct ctdb_vnn_map); + CTDB_NO_MEMORY(ctdb, *vnnmap); + (*vnnmap)->generation = map->generation; + (*vnnmap)->size = map->size; + (*vnnmap)->map = talloc_array(*vnnmap, uint32_t, map->size); + + CTDB_NO_MEMORY(ctdb, (*vnnmap)->map); + memcpy((*vnnmap)->map, map->map, sizeof(uint32_t)*map->size); + talloc_free(outdata.dptr); + + 
return 0; +} + + +/* + get the recovery mode of a remote node + */ +struct ctdb_client_control_state * +ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode) +{ + return ctdb_control_send(ctdb, destnode, 0, + CTDB_CONTROL_GET_RECMODE, 0, tdb_null, + mem_ctx, &timeout, NULL); +} + +int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode) +{ + int ret; + int32_t res; + + ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getrecmode_recv failed\n")); + return -1; + } + + if (recmode) { + *recmode = (uint32_t)res; + } + + return 0; +} + +int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode) +{ + struct ctdb_client_control_state *state; + + state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, timeout, destnode); + return ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx, state, recmode); +} + + + + +/* + set the recovery mode of a remote node + */ +int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmode) +{ + int ret; + TDB_DATA data; + int32_t res; + + data.dsize = sizeof(uint32_t); + data.dptr = (unsigned char *)&recmode; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_SET_RECMODE, 0, data, + NULL, NULL, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setrecmode failed\n")); + return -1; + } + + return 0; +} + + + +/* + get a list of nodes (vnn and flags ) from a remote node + */ +int ctdb_ctrl_getnodemap(struct ctdb_context *ctdb, + struct timeval timeout, uint32_t destnode, + TALLOC_CTX *mem_ctx, struct ctdb_node_map_old **nodemap) +{ + int ret; + TDB_DATA outdata; + int32_t res; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_GET_NODEMAP, 0, tdb_null, + 
mem_ctx, &outdata, &res, &timeout, NULL); + if (ret != 0 || res != 0 || outdata.dsize == 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getnodes failed ret:%d res:%d\n", ret, res)); + return -1; + } + + *nodemap = (struct ctdb_node_map_old *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize); + talloc_free(outdata.dptr); + return 0; +} + +int ctdb_ctrl_get_runstate(struct ctdb_context *ctdb, + struct timeval timeout, + uint32_t destnode, + uint32_t *runstate) +{ + TDB_DATA outdata; + int32_t res; + int ret; + + ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_RUNSTATE, 0, + tdb_null, ctdb, &outdata, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,("ctdb_control for get_runstate failed\n")); + return ret != 0 ? ret : res; + } + + if (outdata.dsize != sizeof(uint32_t)) { + DEBUG(DEBUG_ERR,("Invalid return data in get_runstate\n")); + talloc_free(outdata.dptr); + return -1; + } + + if (runstate != NULL) { + *runstate = *(uint32_t *)outdata.dptr; + } + talloc_free(outdata.dptr); + + return 0; +} + +/* + get debug level on a node + */ +int ctdb_ctrl_get_debuglevel(struct ctdb_context *ctdb, uint32_t destnode, int32_t *level) +{ + int ret; + int32_t res; + TDB_DATA data; + + ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_DEBUG, 0, tdb_null, + ctdb, &data, &res, NULL, NULL); + if (ret != 0 || res != 0) { + return -1; + } + if (data.dsize != sizeof(int32_t)) { + DEBUG(DEBUG_ERR,("Bad control reply size in ctdb_get_debuglevel (got %u)\n", + (unsigned)data.dsize)); + return -1; + } + *level = *(int32_t *)data.dptr; + talloc_free(data.dptr); + return 0; +} + +/* Freeze all databases */ +int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, + uint32_t destnode) +{ + int ret; + int32_t res; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_FREEZE, 0, tdb_null, + NULL, NULL, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR, ("ctdb_ctrl_freeze_priority failed\n")); + return -1; + } + + 
return 0; +} + +/* + get pnn of a node, or -1 + */ +int ctdb_ctrl_getpnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode) +{ + int ret; + int32_t res; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_GET_PNN, 0, tdb_null, + NULL, NULL, &res, &timeout, NULL); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpnn failed\n")); + return -1; + } + + return res; +} + +int ctdb_ctrl_get_public_ips_flags(struct ctdb_context *ctdb, + struct timeval timeout, uint32_t destnode, + TALLOC_CTX *mem_ctx, + uint32_t flags, + struct ctdb_public_ip_list_old **ips) +{ + int ret; + TDB_DATA outdata; + int32_t res; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_GET_PUBLIC_IPS, flags, tdb_null, + mem_ctx, &outdata, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,(__location__ + " ctdb_control for getpublicips failed ret:%d res:%d\n", + ret, res)); + return -1; + } + + *ips = (struct ctdb_public_ip_list_old *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize); + talloc_free(outdata.dptr); + + return 0; +} + +int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb, + struct timeval timeout, uint32_t destnode, + TALLOC_CTX *mem_ctx, + struct ctdb_public_ip_list_old **ips) +{ + return ctdb_ctrl_get_public_ips_flags(ctdb, timeout, + destnode, mem_ctx, + 0, ips); +} + +int ctdb_ctrl_get_ifaces(struct ctdb_context *ctdb, + struct timeval timeout, uint32_t destnode, + TALLOC_CTX *mem_ctx, + struct ctdb_iface_list_old **_ifaces) +{ + int ret; + TDB_DATA outdata; + int32_t res; + struct ctdb_iface_list_old *ifaces; + uint32_t len; + uint32_t i; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_GET_IFACES, 0, tdb_null, + mem_ctx, &outdata, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces " + "failed ret:%d res:%d\n", + ret, res)); + return -1; + } + + len = offsetof(struct ctdb_iface_list_old, ifaces); + if (len > outdata.dsize) { + 
DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces " + "returned invalid data with size %u > %u\n", + (unsigned int)outdata.dsize, + (unsigned int)len)); + dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize); + return -1; + } + + ifaces = (struct ctdb_iface_list_old *)outdata.dptr; + len += ifaces->num*sizeof(struct ctdb_iface); + + if (len > outdata.dsize) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces " + "returned invalid data with size %u > %u\n", + (unsigned int)outdata.dsize, + (unsigned int)len)); + dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize); + return -1; + } + + /* make sure we null terminate the returned strings */ + for (i=0; i < ifaces->num; i++) { + ifaces->ifaces[i].name[CTDB_IFACE_SIZE] = '\0'; + } + + *_ifaces = (struct ctdb_iface_list_old *)talloc_memdup(mem_ctx, + outdata.dptr, + outdata.dsize); + talloc_free(outdata.dptr); + if (*_ifaces == NULL) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces " + "talloc_memdup size %u failed\n", + (unsigned int)outdata.dsize)); + return -1; + } + + return 0; +} + +/* + get all tunables + */ +int ctdb_ctrl_get_all_tunables(struct ctdb_context *ctdb, + struct timeval timeout, + uint32_t destnode, + struct ctdb_tunable_list *tunables) +{ + TDB_DATA outdata; + int ret; + int32_t res; + + ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_ALL_TUNABLES, 0, tdb_null, ctdb, + &outdata, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get all tunables failed\n")); + return -1; + } + + if (outdata.dsize != sizeof(*tunables)) { + DEBUG(DEBUG_ERR,(__location__ " bad data size %u in ctdb_ctrl_get_all_tunables should be %u\n", + (unsigned)outdata.dsize, (unsigned)sizeof(*tunables))); + return -1; + } + + *tunables = *(struct ctdb_tunable_list *)outdata.dptr; + talloc_free(outdata.dptr); + return 0; +} + +/* + set some ctdb flags +*/ +void ctdb_set_flags(struct ctdb_context *ctdb, unsigned flags) +{ + ctdb->flags |= 
flags; +} + +const char *ctdb_get_socketname(struct ctdb_context *ctdb) +{ + return ctdb->daemon.name; +} + +/* + return the pnn of this node +*/ +uint32_t ctdb_get_pnn(struct ctdb_context *ctdb) +{ + return ctdb->pnn; +} + +/* + callback for the async helpers used when sending the same control + to multiple nodes in parallel. +*/ +static void async_callback(struct ctdb_client_control_state *state) +{ + struct client_async_data *data = talloc_get_type(state->async.private_data, struct client_async_data); + struct ctdb_context *ctdb = talloc_get_type(state->ctdb, struct ctdb_context); + int ret; + TDB_DATA outdata; + int32_t res = -1; + uint32_t destnode = state->c->hdr.destnode; + + outdata.dsize = 0; + outdata.dptr = NULL; + + /* one more node has responded with recmode data */ + data->count--; + + /* if we failed to push the db, then return an error and let + the main loop try again. + */ + if (state->state != CTDB_CONTROL_DONE) { + if ( !data->dont_log_errors) { + DEBUG(DEBUG_ERR,("Async operation failed with state %d, opcode:%u\n", state->state, data->opcode)); + } + data->fail_count++; + if (state->state == CTDB_CONTROL_TIMEOUT) { + res = -ETIMEDOUT; + } else { + res = -1; + } + if (data->fail_callback) { + data->fail_callback(ctdb, destnode, res, outdata, + data->callback_data); + } + return; + } + + state->async.fn = NULL; + + ret = ctdb_control_recv(ctdb, state, data, &outdata, &res, NULL); + if ((ret != 0) || (res != 0)) { + if ( !data->dont_log_errors) { + DEBUG(DEBUG_ERR,("Async operation failed with ret=%d res=%d opcode=%u\n", ret, (int)res, data->opcode)); + } + data->fail_count++; + if (data->fail_callback) { + data->fail_callback(ctdb, destnode, res, outdata, + data->callback_data); + } + } + if ((ret == 0) && (data->callback != NULL)) { + data->callback(ctdb, destnode, res, outdata, + data->callback_data); + } +} + + +void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state) +{ + /* set up the callback 
functions */ + state->async.fn = async_callback; + state->async.private_data = data; + + /* one more control to wait for to complete */ + data->count++; +} + + +/* wait for up to the maximum number of seconds allowed + or until all nodes we expect a response from has replied +*/ +int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data) +{ + while (data->count > 0) { + tevent_loop_once(ctdb->ev); + } + if (data->fail_count != 0) { + if (!data->dont_log_errors) { + DEBUG(DEBUG_ERR,("Async wait failed - fail_count=%u\n", + data->fail_count)); + } + return -1; + } + return 0; +} + + +/* + perform a simple control on the listed nodes + The control cannot return data + */ +int ctdb_client_async_control(struct ctdb_context *ctdb, + enum ctdb_controls opcode, + uint32_t *nodes, + uint64_t srvid, + struct timeval timeout, + bool dont_log_errors, + TDB_DATA data, + client_async_callback client_callback, + client_async_callback fail_callback, + void *callback_data) +{ + struct client_async_data *async_data; + struct ctdb_client_control_state *state; + int j, num_nodes; + + async_data = talloc_zero(ctdb, struct client_async_data); + CTDB_NO_MEMORY_FATAL(ctdb, async_data); + async_data->dont_log_errors = dont_log_errors; + async_data->callback = client_callback; + async_data->fail_callback = fail_callback; + async_data->callback_data = callback_data; + async_data->opcode = opcode; + + num_nodes = talloc_get_size(nodes) / sizeof(uint32_t); + + /* loop over all nodes and send an async control to each of them */ + for (j=0; j<num_nodes; j++) { + uint32_t pnn = nodes[j]; + + state = ctdb_control_send(ctdb, pnn, srvid, opcode, + 0, data, async_data, &timeout, NULL); + if (state == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Failed to call async control %u\n", (unsigned)opcode)); + talloc_free(async_data); + return -1; + } + + ctdb_client_async_add(async_data, state); + } + + if (ctdb_client_async_wait(ctdb, async_data) != 0) { + talloc_free(async_data); + 
return -1; + } + + talloc_free(async_data); + return 0; +} + +uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb, + struct ctdb_vnn_map *vnn_map, + TALLOC_CTX *mem_ctx, + bool include_self) +{ + unsigned int i, j, num_nodes; + uint32_t *nodes; + + for (i=num_nodes=0;i<vnn_map->size;i++) { + if (vnn_map->map[i] == ctdb->pnn && !include_self) { + continue; + } + num_nodes++; + } + + nodes = talloc_array(mem_ctx, uint32_t, num_nodes); + CTDB_NO_MEMORY_FATAL(ctdb, nodes); + + for (i=j=0;i<vnn_map->size;i++) { + if (vnn_map->map[i] == ctdb->pnn && !include_self) { + continue; + } + nodes[j++] = vnn_map->map[i]; + } + + return nodes; +} + +/* Get list of nodes not including those with flags specified by mask */ +static uint32_t *list_of_nodes(struct ctdb_context *ctdb, + struct ctdb_node_map_old *node_map, + TALLOC_CTX *mem_ctx, + uint32_t mask, + bool include_self) +{ + unsigned int i, j, num_nodes; + uint32_t exclude_pnn; + uint32_t *nodes; + + exclude_pnn = include_self ? CTDB_UNKNOWN_PNN : ctdb->pnn; + + for (i=num_nodes=0;i<node_map->num;i++) { + if (node_map->nodes[i].flags & mask) { + continue; + } + if (node_map->nodes[i].pnn == exclude_pnn) { + continue; + } + num_nodes++; + } + + nodes = talloc_array(mem_ctx, uint32_t, num_nodes); + CTDB_NO_MEMORY_FATAL(ctdb, nodes); + + for (i=j=0;i<node_map->num;i++) { + if (node_map->nodes[i].flags & mask) { + continue; + } + if (node_map->nodes[i].pnn == exclude_pnn) { + continue; + } + nodes[j++] = node_map->nodes[i].pnn; + } + + return nodes; +} + +uint32_t *list_of_active_nodes(struct ctdb_context *ctdb, + struct ctdb_node_map_old *node_map, + TALLOC_CTX *mem_ctx, + bool include_self) +{ + return list_of_nodes(ctdb, + node_map, + mem_ctx, + NODE_FLAGS_INACTIVE, + include_self); +} + +uint32_t *list_of_connected_nodes(struct ctdb_context *ctdb, + struct ctdb_node_map_old *node_map, + TALLOC_CTX *mem_ctx, + bool include_self) +{ + return list_of_nodes(ctdb, + node_map, + mem_ctx, + NODE_FLAGS_DISCONNECTED, + 
include_self); +} + +/* + get capabilities of a remote node + */ +struct ctdb_client_control_state * +ctdb_ctrl_getcapabilities_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode) +{ + return ctdb_control_send(ctdb, destnode, 0, + CTDB_CONTROL_GET_CAPABILITIES, 0, tdb_null, + mem_ctx, &timeout, NULL); +} + +int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *capabilities) +{ + int ret; + int32_t res; + TDB_DATA outdata; + + ret = ctdb_control_recv(ctdb, state, mem_ctx, &outdata, &res, NULL); + if ( (ret != 0) || (res != 0) ) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv failed\n")); + return -1; + } + + if (capabilities) { + *capabilities = *((uint32_t *)outdata.dptr); + } + + return 0; +} + +int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *capabilities) +{ + struct ctdb_client_control_state *state; + TALLOC_CTX *tmp_ctx = talloc_new(NULL); + int ret; + + state = ctdb_ctrl_getcapabilities_send(ctdb, tmp_ctx, timeout, destnode); + ret = ctdb_ctrl_getcapabilities_recv(ctdb, tmp_ctx, state, capabilities); + talloc_free(tmp_ctx); + return ret; +} + +static void get_capabilities_callback(struct ctdb_context *ctdb, + uint32_t node_pnn, int32_t res, + TDB_DATA outdata, void *callback_data) +{ + struct ctdb_node_capabilities *caps = + talloc_get_type(callback_data, + struct ctdb_node_capabilities); + + if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) { + DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr)); + return; + } + + if (node_pnn >= talloc_array_length(caps)) { + DEBUG(DEBUG_ERR, + (__location__ " unexpected PNN %u\n", node_pnn)); + return; + } + + caps[node_pnn].retrieved = true; + caps[node_pnn].capabilities = *((uint32_t *)outdata.dptr); +} + +struct 
ctdb_node_capabilities * +ctdb_get_capabilities(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + struct timeval timeout, + struct ctdb_node_map_old *nodemap) +{ + uint32_t *nodes; + uint32_t i, res; + struct ctdb_node_capabilities *ret; + + nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true); + + ret = talloc_array(mem_ctx, struct ctdb_node_capabilities, + nodemap->num); + CTDB_NO_MEMORY_NULL(ctdb, ret); + /* Prepopulate the expected PNNs */ + for (i = 0; i < talloc_array_length(ret); i++) { + ret[i].retrieved = false; + } + + res = ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES, + nodes, 0, timeout, + false, tdb_null, + get_capabilities_callback, NULL, + ret); + if (res != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Failed to read node capabilities.\n")); + TALLOC_FREE(ret); + } + + return ret; +} + +uint32_t * +ctdb_get_node_capabilities(struct ctdb_node_capabilities *caps, + uint32_t pnn) +{ + if (pnn < talloc_array_length(caps) && caps[pnn].retrieved) { + return &caps[pnn].capabilities; + } + + return NULL; +} + +bool ctdb_node_has_capabilities(struct ctdb_node_capabilities *caps, + uint32_t pnn, + uint32_t capabilities_required) +{ + uint32_t *capp = ctdb_get_node_capabilities(caps, pnn); + return (capp != NULL) && + ((*capp & capabilities_required) == capabilities_required); +} + +/* + recovery daemon ping to main daemon + */ +int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb) +{ + int ret; + int32_t res; + + ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_PING, 0, tdb_null, + ctdb, NULL, &res, NULL, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,("Failed to send recd ping\n")); + return -1; + } + + return 0; +} + +/* + tell the main daemon how long it took to lock the reclock file + */ +int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency) +{ + int ret; + int32_t res; + TDB_DATA data; + + data.dptr = (uint8_t *)&latency; + data.dsize = sizeof(latency); + + ret 
= ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data, + ctdb, NULL, &res, NULL, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n")); + return -1; + } + + return 0; +} + +int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout, + uint32_t destnode, struct ctdb_ban_state *bantime) +{ + int ret; + TDB_DATA data; + int32_t res; + + data.dsize = sizeof(*bantime); + data.dptr = (uint8_t *)bantime; + + ret = ctdb_control(ctdb, destnode, 0, + CTDB_CONTROL_SET_BAN_STATE, 0, data, + NULL, NULL, &res, &timeout, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set ban state failed\n")); + return -1; + } + + return 0; +} + +struct ctdb_client_control_state * +ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data) +{ + struct ctdb_client_control_state *handle; + struct ctdb_marshall_buffer *m; + struct ctdb_rec_data_old *rec; + TDB_DATA outdata; + + m = talloc_zero(mem_ctx, struct ctdb_marshall_buffer); + if (m == NULL) { + DEBUG(DEBUG_ERR, ("Failed to allocate marshall buffer for update record\n")); + return NULL; + } + + m->db_id = ctdb_db->db_id; + + rec = ctdb_marshall_record(m, 0, key, header, data); + if (rec == NULL) { + DEBUG(DEBUG_ERR,("Failed to marshall record for update record\n")); + talloc_free(m); + return NULL; + } + m = talloc_realloc_size(mem_ctx, m, rec->length + offsetof(struct ctdb_marshall_buffer, data)); + if (m == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata\n")); + talloc_free(m); + return NULL; + } + m->count++; + memcpy((uint8_t *)m + offsetof(struct ctdb_marshall_buffer, data), rec, rec->length); + + + outdata.dptr = (uint8_t *)m; + outdata.dsize = talloc_get_size(m); + + handle = ctdb_control_send(ctdb, destnode, 0, + 
CTDB_CONTROL_UPDATE_RECORD, 0, outdata, + mem_ctx, &timeout, NULL); + talloc_free(m); + return handle; +} + +int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state) +{ + int ret; + int32_t res; + + ret = ctdb_control_recv(ctdb, state, state, NULL, &res, NULL); + if ( (ret != 0) || (res != 0) ){ + DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_update_record_recv failed\n")); + return -1; + } + + return 0; +} + +int +ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data) +{ + struct ctdb_client_control_state *state; + + state = ctdb_ctrl_updaterecord_send(ctdb, mem_ctx, timeout, destnode, ctdb_db, key, header, data); + return ctdb_ctrl_updaterecord_recv(ctdb, state); +} diff --git a/ctdb/server/ctdb_cluster_mutex.c b/ctdb/server/ctdb_cluster_mutex.c new file mode 100644 index 0000000..2fbe301 --- /dev/null +++ b/ctdb/server/ctdb_cluster_mutex.c @@ -0,0 +1,382 @@ +/* + CTDB cluster mutex handling + + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Martin Schwenke 2016 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" + +#include <tevent.h> + +#include "lib/util/debug.h" +#include "lib/util/time.h" +#include "lib/util/strv.h" +#include "lib/util/strv_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/blocking.h" + +#include "ctdb_private.h" + +#include "ctdb_cluster_mutex.h" + +struct ctdb_cluster_mutex_handle { + struct ctdb_context *ctdb; + cluster_mutex_handler_t handler; + void *private_data; + cluster_mutex_lost_handler_t lost_handler; + void *lost_data; + int fd[2]; + struct tevent_timer *te; + struct tevent_fd *fde; + pid_t child; + struct timeval start_time; + bool have_response; +}; + +static void cluster_mutex_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_cluster_mutex_handle *h = + talloc_get_type(private_data, struct ctdb_cluster_mutex_handle); + double latency = timeval_elapsed(&h->start_time); + + if (h->handler != NULL) { + h->handler('2', latency, h->private_data); + } +} + + +/* When the handle is freed it causes any child holding the mutex to + * be killed, thus freeing the mutex */ +static int cluster_mutex_destructor(struct ctdb_cluster_mutex_handle *h) +{ + if (h->fd[0] != -1) { + h->fd[0] = -1; + } + ctdb_kill(h->ctdb, h->child, SIGTERM); + return 0; +} + +/* this is called when the client process has completed ctdb_recovery_lock() + and has written data back to us through the pipe. +*/ +static void cluster_mutex_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + struct ctdb_cluster_mutex_handle *h= + talloc_get_type(private_data, struct ctdb_cluster_mutex_handle); + double latency = timeval_elapsed(&h->start_time); + char c = '0'; + int ret; + + /* Got response from child process so abort timeout */ + TALLOC_FREE(h->te); + + ret = sys_read(h->fd[0], &c, 1); + + /* Don't call the handler more than once. 
It only exists to + * process the initial response from the helper. */ + if (h->have_response) { + /* Only deal with EOF due to process exit. Silently + * ignore any other output. */ + if (ret == 0) { + if (h->lost_handler != NULL) { + h->lost_handler(h->lost_data); + } + } + return; + } + h->have_response = true; + + /* If the child wrote status then just pass it to the handler. + * If no status was written then this is an unexpected error + * so pass generic error code to handler. */ + if (h->handler != NULL) { + h->handler(ret == 1 ? c : '3', latency, h->private_data); + } +} + +static char cluster_mutex_helper[PATH_MAX+1] = ""; + +static bool cluster_mutex_helper_args_file(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) +{ + struct stat st; + size_t size = sizeof(cluster_mutex_helper); + const char *t; + char **args = NULL; + int ret; + + if (cluster_mutex_helper[0] != '\0') { + goto helper_done; + } + + t = getenv("CTDB_CLUSTER_MUTEX_HELPER"); + if (t != NULL) { + size_t len; + + len = strlcpy(cluster_mutex_helper, t, size); + if (len >= size) { + DBG_ERR("error: CTDB_CLUSTER_MUTEX_HELPER too long\n"); + exit(1); + } + } else { + ret = snprintf(cluster_mutex_helper, + size, + "%s/%s", + CTDB_HELPER_BINDIR, + "ctdb_mutex_fcntl_helper"); + if (ret < 0 || (size_t)ret >= size) { + D_ERR("Unable to set cluster mutex helper - " + "path too long\n"); + exit(1); + } + } + + ret = stat(cluster_mutex_helper, &st); + if (ret != 0) { + D_ERR("Unable to set cluster mutex helper \"%s\" - %s\n", + cluster_mutex_helper, + strerror(errno)); + exit(1); + } + + if ((st.st_mode & S_IXUSR) == 0) { + D_ERR("Unable to set cluster_mutex helper \"%s\" - " + "not executable\n", + cluster_mutex_helper); + exit(1); + } + + D_NOTICE("Set cluster mutex helper to \"%s\"\n", cluster_mutex_helper); + +helper_done: + + /* Array includes default helper, file and NULL */ + args = talloc_array(mem_ctx, char *, 3); + if (args == NULL) { + DBG_ERR("Memory allocation error\n"); + 
return false; + } + + args[0] = cluster_mutex_helper; + + args[1] = talloc_strdup(args, argstring); + if (args[1] == NULL) { + DBG_ERR("Memory allocation error\n"); + return false; + } + + args[2] = NULL; + + *argv = args; + return true; +} + +static bool cluster_mutex_helper_args_cmd(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) +{ + int i, ret, n; + char **args = NULL; + char *strv = NULL; + char *t = NULL; + + ret = strv_split(mem_ctx, &strv, argstring, " \t"); + if (ret != 0) { + D_ERR("Unable to parse mutex helper command \"%s\" (%s)\n", + argstring, + strerror(ret)); + return false; + } + n = strv_count(strv); + if (n == 0) { + D_ERR("Mutex helper command is empty \"%s\"\n", argstring); + return false; + } + + /* Extra slot for NULL */ + args = talloc_array(mem_ctx, char *, n + 1); + if (args == NULL) { + DBG_ERR("Memory allocation error\n"); + return false; + } + + talloc_steal(args, strv); + + t = NULL; + for (i = 0 ; i < n; i++) { + t = strv_next(strv, t); + args[i] = t; + } + + args[n] = NULL; + + *argv = args; + return true; +} + +static bool cluster_mutex_helper_args(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) +{ + bool ok; + + if (argstring != NULL && argstring[0] == '!') { + ok = cluster_mutex_helper_args_cmd(mem_ctx, &argstring[1], argv); + } else { + ok = cluster_mutex_helper_args_file(mem_ctx, argstring, argv); + } + + return ok; +} + +struct ctdb_cluster_mutex_handle * +ctdb_cluster_mutex(TALLOC_CTX *mem_ctx, + struct ctdb_context *ctdb, + const char *argstring, + int timeout, + cluster_mutex_handler_t handler, + void *private_data, + cluster_mutex_lost_handler_t lost_handler, + void *lost_data) +{ + struct ctdb_cluster_mutex_handle *h; + char **args; + sigset_t sigset_term; + int ret; + + h = talloc(mem_ctx, struct ctdb_cluster_mutex_handle); + if (h == NULL) { + DBG_ERR("out of memory\n"); + return NULL; + } + + h->start_time = timeval_current(); + h->fd[0] = -1; + h->fd[1] = -1; + h->have_response = false; + + 
ret = pipe(h->fd); + if (ret != 0) { + talloc_free(h); + DBG_ERR("Failed to open pipe\n"); + return NULL; + } + set_close_on_exec(h->fd[0]); + + /* Create arguments for lock helper */ + if (!cluster_mutex_helper_args(h, argstring, &args)) { + close(h->fd[0]); + close(h->fd[1]); + talloc_free(h); + return NULL; + } + + sigemptyset(&sigset_term); + sigaddset(&sigset_term, SIGTERM); + ret = sigprocmask(SIG_BLOCK, &sigset_term, NULL); + if (ret != 0) { + DBG_WARNING("Failed to block SIGTERM (%d)\n", errno); + } + + h->child = ctdb_fork(ctdb); + if (h->child == (pid_t)-1) { + close(h->fd[0]); + close(h->fd[1]); + talloc_free(h); + ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL); + if (ret != 0) { + DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno); + } + return NULL; + } + + if (h->child == 0) { + struct sigaction sa = { + .sa_handler = SIG_DFL, + }; + + ret = sigaction(SIGTERM, &sa, NULL); + if (ret != 0) { + DBG_WARNING("Failed to reset signal handler (%d)\n", + errno); + } + + ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL); + if (ret != 0) { + DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno); + } + + /* Make stdout point to the pipe */ + close(STDOUT_FILENO); + dup2(h->fd[1], STDOUT_FILENO); + close(h->fd[1]); + + execv(args[0], args); + + /* Only happens on error */ + DBG_ERR("execv() failed\n"); + _exit(1); + } + + /* Parent */ + + ret = sigprocmask(SIG_UNBLOCK, &sigset_term, NULL); + if (ret != 0) { + DBG_WARNING("Failed to unblock SIGTERM (%d)\n", errno); + } + + DBG_DEBUG("Created PIPE FD:%d\n", h->fd[0]); + set_close_on_exec(h->fd[0]); + + close(h->fd[1]); + h->fd[1] = -1; + + talloc_set_destructor(h, cluster_mutex_destructor); + + if (timeout != 0) { + h->te = tevent_add_timer(ctdb->ev, h, + timeval_current_ofs(timeout, 0), + cluster_mutex_timeout, h); + } else { + h->te = NULL; + } + + h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ, + cluster_mutex_handler, (void *)h); + + if (h->fde == NULL) { + talloc_free(h); + return NULL; 
+ } + tevent_fd_set_auto_close(h->fde); + + h->ctdb = ctdb; + h->handler = handler; + h->private_data = private_data; + h->lost_handler = lost_handler; + h->lost_data = lost_data; + + return h; +} diff --git a/ctdb/server/ctdb_cluster_mutex.h b/ctdb/server/ctdb_cluster_mutex.h new file mode 100644 index 0000000..4587290 --- /dev/null +++ b/ctdb/server/ctdb_cluster_mutex.h @@ -0,0 +1,51 @@ +/* + CTDB cluster mutex handling + + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Martin Schwenke 2016 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __CTDB_CLUSTER_MUTEX_H__ +#define __CTDB_CLUSTER_MUTEX_H__ + +#include <talloc.h> + +#include "replace.h" +#include "system/network.h" + +#include "ctdb_private.h" + /* Opaque state of one cluster mutex attempt; defined in ctdb_cluster_mutex.c */ +struct ctdb_cluster_mutex_handle; + /* Callback receiving the initial result of a mutex attempt: a one character status code, the time elapsed since the attempt was started and the caller's private pointer */ +typedef void (*cluster_mutex_handler_t) ( + char status, + double latency, + void *private_data); + /* Callback invoked with the caller's private pointer when the mutex is reported as lost */ +typedef void (*cluster_mutex_lost_handler_t) (void *private_data); + /* Start an attempt to take the cluster mutex described by argstring. Returns a handle allocated on mem_ctx, or NULL on failure. handler/private_data and lost_handler/lost_data are the callbacks described above with their private pointers. */ +struct ctdb_cluster_mutex_handle * +ctdb_cluster_mutex(TALLOC_CTX *mem_ctx, + struct ctdb_context *ctdb, + const char *argstring, + int timeout, + cluster_mutex_handler_t handler, + void *private_data, + cluster_mutex_lost_handler_t lost_handler, + void *lost_data); + +#endif /* __CTDB_CLUSTER_MUTEX_H__ */ diff --git a/ctdb/server/ctdb_config.c b/ctdb/server/ctdb_config.c new file mode 100644 index 0000000..3f61fda --- /dev/null +++ b/ctdb/server/ctdb_config.c @@ -0,0 +1,183 @@ +/* + CTDB daemon config handling + + Copyright (C) Martin Schwenke 2018 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" + +#include "lib/util/debug.h" + +#include "common/conf.h" +#include "common/logging_conf.h" +#include "common/path.h" + +#include "cluster/cluster_conf.h" +#include "database/database_conf.h" +#include "event/event_conf.h" +#include "failover/failover_conf.h" +#include "legacy_conf.h" + +#include "ctdb_config.h" + +struct ctdb_config ctdb_config; + +static void setup_config_pointers(struct conf_context *conf) +{ + /* + * Cluster + */ + + conf_assign_string_pointer(conf, + CLUSTER_CONF_SECTION, + CLUSTER_CONF_TRANSPORT, + &ctdb_config.transport); + conf_assign_string_pointer(conf, + CLUSTER_CONF_SECTION, + CLUSTER_CONF_NODE_ADDRESS, + &ctdb_config.node_address); + conf_assign_string_pointer(conf, + CLUSTER_CONF_SECTION, + CLUSTER_CONF_CLUSTER_LOCK, + &ctdb_config.cluster_lock); + conf_assign_string_pointer(conf, + CLUSTER_CONF_SECTION, + CLUSTER_CONF_RECOVERY_LOCK, + &ctdb_config.recovery_lock); + conf_assign_integer_pointer(conf, + CLUSTER_CONF_SECTION, + CLUSTER_CONF_LEADER_TIMEOUT, + &ctdb_config.leader_timeout); + conf_assign_boolean_pointer(conf, + CLUSTER_CONF_SECTION, + CLUSTER_CONF_LEADER_CAPABILITY, + &ctdb_config.leader_capability); + + /* + * Database + */ + + conf_assign_string_pointer(conf, + DATABASE_CONF_SECTION, + DATABASE_CONF_VOLATILE_DB_DIR, + &ctdb_config.dbdir_volatile); + conf_assign_string_pointer(conf, + DATABASE_CONF_SECTION, + DATABASE_CONF_PERSISTENT_DB_DIR, + &ctdb_config.dbdir_persistent); + conf_assign_string_pointer(conf, + DATABASE_CONF_SECTION, + DATABASE_CONF_STATE_DB_DIR, + &ctdb_config.dbdir_state); + conf_assign_string_pointer(conf, + DATABASE_CONF_SECTION, + DATABASE_CONF_LOCK_DEBUG_SCRIPT, + &ctdb_config.lock_debug_script); + conf_assign_boolean_pointer(conf, + DATABASE_CONF_SECTION, + DATABASE_CONF_TDB_MUTEXES, + &ctdb_config.tdb_mutexes); + + /* + * Event + */ + conf_assign_string_pointer(conf, + EVENT_CONF_SECTION, + EVENT_CONF_DEBUG_SCRIPT, + &ctdb_config.event_debug_script); + + /* + * Failover 
+ */ + conf_assign_boolean_pointer(conf, + FAILOVER_CONF_SECTION, + FAILOVER_CONF_DISABLED, + &ctdb_config.failover_disabled); + + /* + * Legacy + */ + + conf_assign_boolean_pointer(conf, + LEGACY_CONF_SECTION, + LEGACY_CONF_REALTIME_SCHEDULING, + &ctdb_config.realtime_scheduling); + conf_assign_boolean_pointer(conf, + LEGACY_CONF_SECTION, + LEGACY_CONF_LMASTER_CAPABILITY, + &ctdb_config.lmaster_capability); + conf_assign_boolean_pointer(conf, + LEGACY_CONF_SECTION, + LEGACY_CONF_START_AS_STOPPED, + &ctdb_config.start_as_stopped); + conf_assign_boolean_pointer(conf, + LEGACY_CONF_SECTION, + LEGACY_CONF_START_AS_DISABLED, + &ctdb_config.start_as_disabled); + conf_assign_string_pointer(conf, + LEGACY_CONF_SECTION, + LEGACY_CONF_SCRIPT_LOG_LEVEL, + &ctdb_config.script_log_level); +} + +int ctdbd_config_load(TALLOC_CTX *mem_ctx, + struct conf_context **result) +{ + struct conf_context *conf = NULL; + int ret = 0; + char *conf_file = NULL; + + ret = conf_init(mem_ctx, &conf); + if (ret != 0) { + return ret; + } + + logging_conf_init(conf, NULL); + cluster_conf_init(conf); + database_conf_init(conf); + event_conf_init(conf); + failover_conf_init(conf); + legacy_conf_init(conf); + + setup_config_pointers(conf); + + if (! 
conf_valid(conf)) { + ret = EINVAL; + goto fail; + } + + conf_file = path_config(conf); + if (conf_file == NULL) { + D_ERR("Memory allocation error\n"); + ret = ENOMEM; + goto fail; + } + ret = conf_load(conf, conf_file, true); + /* Configuration file does not need to exist */ + if (ret != 0 && ret != ENOENT) { + D_ERR("Failed to load configuration file %s\n", conf_file); + goto fail; + } + + talloc_free(conf_file); + *result = conf; + + return 0; + +fail: + talloc_free(conf); + return ret; +} diff --git a/ctdb/server/ctdb_config.h b/ctdb/server/ctdb_config.h new file mode 100644 index 0000000..7ccda7d --- /dev/null +++ b/ctdb/server/ctdb_config.h @@ -0,0 +1,59 @@ +/* + CTDB daemon config handling + + Copyright (C) Martin Schwenke 2018 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __CTDB_CONFIG_H__ +#define __CTDB_CONFIG_H__ + +#include "common/conf.h" + /* Daemon configuration values, grouped by configuration section. String members are pointers to constant strings that this structure only refers to. */ +struct ctdb_config { + /* Cluster */ + const char *transport; + const char *node_address; + const char *cluster_lock; + const char *recovery_lock; + int leader_timeout; + bool leader_capability; + + /* Database */ + const char *dbdir_volatile; + const char *dbdir_persistent; + const char *dbdir_state; + const char *lock_debug_script; + bool tdb_mutexes; + + /* Event */ + const char *event_debug_script; + + /* Failover */ + bool failover_disabled; + + /* Legacy */ + bool realtime_scheduling; + bool lmaster_capability; + bool start_as_stopped; + bool start_as_disabled; + const char *script_log_level; +}; + /* The single, process-wide instance of the configuration */ +extern struct ctdb_config ctdb_config; + /* Load the daemon configuration; the configuration context is handed back through conf. NOTE(review): the return convention (0 on success, errno-style code on failure) is defined by the implementation in ctdb_config.c - confirm there. */ +int ctdbd_config_load(TALLOC_CTX *mem_ctx, struct conf_context **conf); + +#endif /* __CTDB_CONFIG_H__ */ diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c new file mode 100644 index 0000000..422c4cf --- /dev/null +++ b/ctdb/server/ctdb_control.c @@ -0,0 +1,1097 @@ +/* + ctdb_control protocol code + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/talloc_report.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "protocol/protocol_private.h" + +#include "common/reqid.h" +#include "common/common.h" +#include "common/logging.h" + + +struct ctdb_control_state { + struct ctdb_context *ctdb; + uint32_t reqid; + ctdb_control_callback_fn_t callback; + void *private_data; + unsigned flags; +}; + + +/* + dump talloc memory hierarchy, returning it as a blob to the client + */ +int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata) +{ + char *report; + size_t reportlen; + /* The report is allocated as a talloc child of outdata */ + report = talloc_report_str(outdata, NULL); + if (report == NULL) { + DEBUG(DEBUG_ERR, + (__location__ " talloc_report_str failed\n")); + return -1; + } + reportlen = talloc_get_size(report); + + if (reportlen > 0) { + reportlen -= 1; /* strip trailing zero */ + } + + outdata->dptr = (uint8_t *)report; + outdata->dsize = reportlen; + return 0; +} + /* Log that the control named by unsupported is no longer implemented, mentioning alternate as the replacement when it is not NULL. Always returns -1. */ +static int32_t control_not_implemented(const char *unsupported, + const char *alternate) +{ + if (alternate == NULL) { + DEBUG(DEBUG_ERR, + ("Control %s is not implemented any more\n", + unsupported)); + } else { + DEBUG(DEBUG_ERR, + ("Control %s is not implemented any more, use %s instead\n", + unsupported, alternate)); + } + return -1; +} + /* State kept between receiving an echo request and sending its reply from the timer callback */ +struct ctdb_echo_data_state { + struct ctdb_context *ctdb; + struct ctdb_req_control_old *c; + struct ctdb_echo_data *data; +}; + +static void ctdb_echo_data_timeout( + struct tevent_context *ev, + struct tevent_timer *te, + struct timeval now, + void *private_data); + /* Decode the echo request in indata and schedule ctdb_echo_data_timeout() to send the reply. On success the request c is moved into the state, *async_reply is set to true and 0 is returned; on failure the state is freed and -1 is returned. */ +static int32_t ctdb_control_echo_data( + struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, + bool *async_reply) +{ + struct 
ctdb_echo_data_state *state = NULL; + struct tevent_timer *te = NULL; + uint32_t delay = 0; + size_t np = 0; + int ret; + + state = talloc_zero(ctdb, struct ctdb_echo_data_state); + CTDB_NO_MEMORY(ctdb, state); + state->ctdb = ctdb; + + ret = ctdb_echo_data_pull( + indata.dptr, indata.dsize, state, &state->data, &np); + if (ret != 0) { + DBG_DEBUG("ctdb_echo_data_pull failed: %s\n", + strerror(ret)); + TALLOC_FREE(state); + return -1; + } + /* NOTE(review): delay is never changed from 0 in this function, so the timer is armed for the current time - confirm that the decoded request is not meant to supply a delay */ + te = tevent_add_timer( + ctdb->ev, + state, + timeval_current_ofs_msec(delay), + ctdb_echo_data_timeout, + state); + if (te == NULL) { + DBG_DEBUG("tevent_add_timer failed\n"); + TALLOC_FREE(state); + return -1; + } + /* The request now belongs to the state and is freed with it */ + state->c = talloc_move(state, &c); + *async_reply = true; + + return 0; +} + /* Timer callback: re-encode the stored echo data and send it back as the reply (status 0) to the stored request, then free the state. If the reply buffer cannot be allocated only a warning is logged and no reply is sent. */ +static void ctdb_echo_data_timeout( + struct tevent_context *ev, + struct tevent_timer *te, + struct timeval now, + void *private_data) +{ + struct ctdb_echo_data_state *state = talloc_get_type_abort( + private_data, struct ctdb_echo_data_state); + size_t len = ctdb_echo_data_len(state->data); + uint8_t *buf = NULL; + size_t np; + TDB_DATA data; + + DBG_DEBUG("reqid=%"PRIu32" len=%zu\n", state->c->hdr.reqid, len); + + buf = talloc_array(state, uint8_t, len); + if (buf == NULL) { + DBG_WARNING("talloc_array(%zu) failed\n", len); + goto done; + } + ctdb_echo_data_push(state->data, buf, &np); + data = (TDB_DATA) { .dptr = buf, .dsize = np }; + + ctdb_request_control_reply(state->ctdb, state->c, &data, 0, NULL); + +done: + TALLOC_FREE(state); +} + /* Set NODE_FLAGS_PERMANENTLY_DISABLED on the current node. Returns 0, or -1 if the current node cannot be found. */ +static int ctdb_control_disable_node(struct ctdb_context *ctdb) +{ + struct ctdb_node *node; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + if (node == NULL) { + /* Can't happen */ + DBG_ERR("Unable to find current node\n"); + return -1; + } + + D_ERR("Disable node\n"); + node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED; + + return 0; +} + +static int ctdb_control_enable_node(struct ctdb_context *ctdb) +{ + struct ctdb_node *node; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + if 
(node == NULL) { + /* Can't happen */ + DBG_ERR("Unable to find current node\n"); + return -1; + } + + D_ERR("Enable node\n"); + node->flags &= ~NODE_FLAGS_PERMANENTLY_DISABLED; + + return 0; +} + +/* + process a control request + */ +static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, + TDB_DATA *outdata, uint32_t srcnode, + const char **errormsg, + bool *async_reply) +{ + uint32_t opcode = c->opcode; + uint64_t srvid = c->srvid; + uint32_t client_id = c->client_id; + static int level = DEBUG_ERR; + + switch (opcode) { + case CTDB_CONTROL_PROCESS_EXISTS: { + CHECK_CONTROL_DATA_SIZE(sizeof(pid_t)); + return ctdb_control_process_exists(ctdb, *(pid_t *)indata.dptr); + } + + case CTDB_CONTROL_SET_DEBUG: { + union { + uint8_t *ptr; + int32_t *level; + } debug; + CHECK_CONTROL_DATA_SIZE(sizeof(int32_t)); + debug.ptr = indata.dptr; + debuglevel_set(*debug.level); + return 0; + } + + case CTDB_CONTROL_GET_DEBUG: { + CHECK_CONTROL_DATA_SIZE(0); + level = debuglevel_get(); + outdata->dptr = (uint8_t *)&(level); + outdata->dsize = sizeof(DEBUGLEVEL); + return 0; + } + + case CTDB_CONTROL_STATISTICS: { + CHECK_CONTROL_DATA_SIZE(0); + ctdb->statistics.memory_used = talloc_total_size(NULL); + ctdb->statistics.num_clients = ctdb->num_clients; + ctdb->statistics.frozen = (ctdb_db_all_frozen(ctdb) ? 
1 : 0); + ctdb->statistics.recovering = (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE); + ctdb->statistics.statistics_current_time = timeval_current(); + + outdata->dptr = (uint8_t *)&ctdb->statistics; + outdata->dsize = sizeof(ctdb->statistics); + return 0; + } + + case CTDB_CONTROL_GET_ALL_TUNABLES: { + CHECK_CONTROL_DATA_SIZE(0); + outdata->dptr = (uint8_t *)&ctdb->tunable; + outdata->dsize = sizeof(ctdb->tunable); + return 0; + } + + case CTDB_CONTROL_DUMP_MEMORY: { + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_dump_memory(ctdb, outdata); + } + + case CTDB_CONTROL_STATISTICS_RESET: { + struct ctdb_db_context *ctdb_db; + + CHECK_CONTROL_DATA_SIZE(0); + ZERO_STRUCT(ctdb->statistics); + for (ctdb_db = ctdb->db_list; + ctdb_db != NULL; + ctdb_db = ctdb_db->next) { + ctdb_db_statistics_reset(ctdb_db); + } + ctdb->statistics.statistics_start_time = timeval_current(); + return 0; + } + + case CTDB_CONTROL_GETVNNMAP: + return ctdb_control_getvnnmap(ctdb, opcode, indata, outdata); + + case CTDB_CONTROL_GET_DBMAP: + return ctdb_control_getdbmap(ctdb, opcode, indata, outdata); + + case CTDB_CONTROL_GET_NODEMAPv4: + return control_not_implemented("GET_NODEMAPv4", "GET_NODEMAP"); + + case CTDB_CONTROL_GET_NODEMAP: + return ctdb_control_getnodemap(ctdb, opcode, indata, outdata); + + case CTDB_CONTROL_GET_NODES_FILE: + return ctdb_control_getnodesfile(ctdb, opcode, indata, outdata); + + case CTDB_CONTROL_RELOAD_NODES_FILE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_reload_nodes_file(ctdb, opcode); + + case CTDB_CONTROL_SET_DB_STICKY: { + uint32_t db_id; + struct ctdb_db_context *ctdb_db; + + CHECK_CONTROL_DATA_SIZE(sizeof(db_id)); + db_id = *(uint32_t *)indata.dptr; + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) return -1; + return ctdb_set_db_sticky(ctdb, ctdb_db); + } + + case CTDB_CONTROL_SETVNNMAP: + return ctdb_control_setvnnmap(ctdb, opcode, indata, outdata); + + case CTDB_CONTROL_PULL_DB: + return control_not_implemented("PULL_DB", NULL); + + case 
CTDB_CONTROL_SET_DMASTER: + return control_not_implemented("SET_DMASTER", NULL); + + case CTDB_CONTROL_PUSH_DB: + return control_not_implemented("PUSH_DB", NULL); + + case CTDB_CONTROL_GET_RECMODE: { + return ctdb->recovery_mode; + } + + case CTDB_CONTROL_SET_RECMASTER: + return control_not_implemented("SET_RECMASTER", NULL); + + case CTDB_CONTROL_GET_RECMASTER: + return control_not_implemented("GET_RECMASTER", NULL); + + case CTDB_CONTROL_GET_PID: + return getpid(); + + case CTDB_CONTROL_GET_PNN: + return ctdb->pnn; + + case CTDB_CONTROL_PING: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb->num_clients; + + case CTDB_CONTROL_GET_RUNSTATE: + CHECK_CONTROL_DATA_SIZE(0); + outdata->dptr = (uint8_t *)&ctdb->runstate; + outdata->dsize = sizeof(uint32_t); + return 0; + + + case CTDB_CONTROL_SET_DB_READONLY: { + uint32_t db_id; + struct ctdb_db_context *ctdb_db; + + CHECK_CONTROL_DATA_SIZE(sizeof(db_id)); + db_id = *(uint32_t *)indata.dptr; + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) return -1; + return ctdb_set_db_readonly(ctdb, ctdb_db); + } + case CTDB_CONTROL_GET_DBNAME: { + uint32_t db_id; + struct ctdb_db_context *ctdb_db; + + CHECK_CONTROL_DATA_SIZE(sizeof(db_id)); + db_id = *(uint32_t *)indata.dptr; + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) return -1; + outdata->dptr = discard_const(ctdb_db->db_name); + outdata->dsize = strlen(ctdb_db->db_name)+1; + return 0; + } + + case CTDB_CONTROL_GETDBPATH: { + uint32_t db_id; + struct ctdb_db_context *ctdb_db; + + CHECK_CONTROL_DATA_SIZE(sizeof(db_id)); + db_id = *(uint32_t *)indata.dptr; + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) return -1; + outdata->dptr = discard_const(ctdb_db->db_path); + outdata->dsize = strlen(ctdb_db->db_path)+1; + return 0; + } + + case CTDB_CONTROL_DB_ATTACH: + return ctdb_control_db_attach(ctdb, + indata, + outdata, + 0, + srcnode, + client_id, + c, + async_reply); + + case CTDB_CONTROL_DB_ATTACH_PERSISTENT: + return 
ctdb_control_db_attach(ctdb, + indata, + outdata, + CTDB_DB_FLAGS_PERSISTENT, + srcnode, + client_id, + c, + async_reply); + + case CTDB_CONTROL_DB_ATTACH_REPLICATED: + return ctdb_control_db_attach(ctdb, + indata, + outdata, + CTDB_DB_FLAGS_REPLICATED, + srcnode, + client_id, + c, + async_reply); + + case CTDB_CONTROL_SET_CALL: + return control_not_implemented("SET_CALL", NULL); + + case CTDB_CONTROL_TRAVERSE_START: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start)); + return ctdb_control_traverse_start(ctdb, indata, outdata, srcnode, client_id); + + case CTDB_CONTROL_TRAVERSE_START_EXT: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start_ext)); + return ctdb_control_traverse_start_ext(ctdb, indata, outdata, srcnode, client_id); + + case CTDB_CONTROL_TRAVERSE_ALL: + return ctdb_control_traverse_all(ctdb, indata, outdata); + + case CTDB_CONTROL_TRAVERSE_ALL_EXT: + return ctdb_control_traverse_all_ext(ctdb, indata, outdata); + + case CTDB_CONTROL_TRAVERSE_DATA: + return ctdb_control_traverse_data(ctdb, indata, outdata); + + case CTDB_CONTROL_TRAVERSE_KILL: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start)); + return ctdb_control_traverse_kill(ctdb, indata, outdata, srcnode); + + case CTDB_CONTROL_REGISTER_SRVID: + return daemon_register_message_handler(ctdb, client_id, srvid); + + case CTDB_CONTROL_DEREGISTER_SRVID: + return daemon_deregister_message_handler(ctdb, client_id, srvid); + + case CTDB_CONTROL_CHECK_SRVIDS: + return control_not_implemented("CHECK_SRVIDS", NULL); + + case CTDB_CONTROL_ENABLE_SEQNUM: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_ltdb_enable_seqnum(ctdb, *(uint32_t *)indata.dptr); + + case CTDB_CONTROL_UPDATE_SEQNUM: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_ltdb_update_seqnum(ctdb, *(uint32_t *)indata.dptr, srcnode); + + case CTDB_CONTROL_FREEZE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_freeze(ctdb, c, async_reply); + + case CTDB_CONTROL_THAW: + return 
control_not_implemented("THAW", NULL); + + case CTDB_CONTROL_SET_RECMODE: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg); + + case CTDB_CONTROL_GET_MONMODE: + return control_not_implemented("GET_MONMODE", NULL); + + case CTDB_CONTROL_ENABLE_MONITOR: + return control_not_implemented("ENABLE_MONITOR", NULL); + + case CTDB_CONTROL_RUN_EVENTSCRIPTS: + return control_not_implemented("RUN_EVENTSCRIPTS", NULL); + + case CTDB_CONTROL_DISABLE_MONITOR: + return control_not_implemented("DISABLE_MONITOR", NULL); + + case CTDB_CONTROL_SHUTDOWN: + DEBUG(DEBUG_NOTICE,("Received SHUTDOWN command.\n")); + ctdb_shutdown_sequence(ctdb, 0); + /* In case above returns due to duplicate shutdown */ + return 0; + + case CTDB_CONTROL_TAKEOVER_IPv4: + return control_not_implemented("TAKEOVER_IPv4", "TAKEOVER_IP"); + + case CTDB_CONTROL_TAKEOVER_IP: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip)); + return ctdb_control_takeover_ip(ctdb, c, indata, async_reply); + + case CTDB_CONTROL_RELEASE_IPv4: + return control_not_implemented("RELEASE_IPv4", "RELEASE_IP"); + + case CTDB_CONTROL_RELEASE_IP: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip)); + return ctdb_control_release_ip(ctdb, c, indata, async_reply); + + case CTDB_CONTROL_IPREALLOCATED: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_ipreallocated(ctdb, c, async_reply); + + case CTDB_CONTROL_GET_PUBLIC_IPSv4: + return control_not_implemented("GET_PUBLIC_IPSv4", + "GET_PUBLIC_IPS"); + + case CTDB_CONTROL_GET_PUBLIC_IPS: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_get_public_ips(ctdb, c, outdata); + + case CTDB_CONTROL_TCP_CLIENT: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection)); + return ctdb_control_tcp_client(ctdb, client_id, indata); + + case CTDB_CONTROL_STARTUP: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_startup(ctdb, srcnode); + + case CTDB_CONTROL_TCP_ADD: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection)); 
+ return ctdb_control_tcp_add(ctdb, indata, false); + + case CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection)); + return ctdb_control_tcp_add(ctdb, indata, true); + + case CTDB_CONTROL_TCP_REMOVE: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection)); + return ctdb_control_tcp_remove(ctdb, indata); + + case CTDB_CONTROL_SET_TUNABLE: + return ctdb_control_set_tunable(ctdb, indata); + + case CTDB_CONTROL_GET_TUNABLE: + return ctdb_control_get_tunable(ctdb, indata, outdata); + + case CTDB_CONTROL_LIST_TUNABLES: + return ctdb_control_list_tunables(ctdb, outdata); + + case CTDB_CONTROL_MODIFY_FLAGS: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_node_flag_change)); + return ctdb_control_modflags(ctdb, indata); + + case CTDB_CONTROL_KILL_TCP: + return control_not_implemented("KILL_TCP", NULL); + + case CTDB_CONTROL_GET_TCP_TICKLE_LIST: + CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr)); + return ctdb_control_get_tcp_tickle_list(ctdb, indata, outdata); + + case CTDB_CONTROL_SET_TCP_TICKLE_LIST: + /* data size is verified in the called function */ + return ctdb_control_set_tcp_tickle_list(ctdb, indata); + + case CTDB_CONTROL_REGISTER_SERVER_ID: + return control_not_implemented("REGISTER_SERVER_ID", NULL); + + case CTDB_CONTROL_UNREGISTER_SERVER_ID: + return control_not_implemented("UNREGISTER_SERVER_ID", NULL); + + case CTDB_CONTROL_CHECK_SERVER_ID: + return control_not_implemented("CHECK_SERVER_ID", NULL); + + case CTDB_CONTROL_GET_SERVER_ID_LIST: + return control_not_implemented("SERVER_ID_LIST", NULL); + + case CTDB_CONTROL_PERSISTENT_STORE: + return control_not_implemented("PERSISTENT_STORE", NULL); + + case CTDB_CONTROL_UPDATE_RECORD: + return ctdb_control_update_record(ctdb, c, indata, async_reply); + + case CTDB_CONTROL_SEND_GRATUITOUS_ARP: + return ctdb_control_send_gratious_arp(ctdb, indata); + + case CTDB_CONTROL_TRANSACTION_START: + return control_not_implemented("TRANSACTION_START", NULL); + + case 
CTDB_CONTROL_TRANSACTION_COMMIT: + return control_not_implemented("TRANSACTION_COMMIT", NULL); + + case CTDB_CONTROL_WIPE_DATABASE: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb)); + return ctdb_control_wipe_database(ctdb, indata); + + case CTDB_CONTROL_UPTIME: + return ctdb_control_uptime(ctdb, outdata); + + case CTDB_CONTROL_START_RECOVERY: + return ctdb_control_start_recovery(ctdb, c, async_reply); + + case CTDB_CONTROL_END_RECOVERY: + return ctdb_control_end_recovery(ctdb, c, async_reply); + + case CTDB_CONTROL_TRY_DELETE_RECORDS: + return ctdb_control_try_delete_records(ctdb, indata, outdata); + + case CTDB_CONTROL_ADD_PUBLIC_IP: + return ctdb_control_add_public_address(ctdb, indata); + + case CTDB_CONTROL_DEL_PUBLIC_IP: + return ctdb_control_del_public_address(ctdb, indata); + + case CTDB_CONTROL_GET_CAPABILITIES: + return ctdb_control_get_capabilities(ctdb, outdata); + + case CTDB_CONTROL_START_PERSISTENT_UPDATE: + return ctdb_control_start_persistent_update(ctdb, c, indata); + + case CTDB_CONTROL_CANCEL_PERSISTENT_UPDATE: + return ctdb_control_cancel_persistent_update(ctdb, c, indata); + + case CTDB_CONTROL_TRANS2_COMMIT: + case CTDB_CONTROL_TRANS2_COMMIT_RETRY: + return control_not_implemented("TRANS2_COMMIT", "TRANS3_COMMIT"); + + case CTDB_CONTROL_TRANS2_ERROR: + return control_not_implemented("TRANS2_ERROR", NULL); + + case CTDB_CONTROL_TRANS2_FINISHED: + return control_not_implemented("TRANS2_FINISHED", NULL); + + case CTDB_CONTROL_TRANS2_ACTIVE: + return control_not_implemented("TRANS2_ACTIVE", NULL); + + case CTDB_CONTROL_TRANS3_COMMIT: + return ctdb_control_trans3_commit(ctdb, c, indata, async_reply); + + case CTDB_CONTROL_RECD_PING: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_recd_ping(ctdb); + + case CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS: + return control_not_implemented("GET_EVENT_SCRIPT_STATUS", NULL); + + case CTDB_CONTROL_RECD_RECLOCK_LATENCY: + CHECK_CONTROL_DATA_SIZE(sizeof(double)); + CTDB_UPDATE_RECLOCK_LATENCY(ctdb, "recd 
reclock", reclock.recd, *((double *)indata.dptr)); + return 0; + case CTDB_CONTROL_GET_RECLOCK_FILE: + CHECK_CONTROL_DATA_SIZE(0); + if (ctdb->recovery_lock != NULL) { + outdata->dptr = discard_const(ctdb->recovery_lock); + outdata->dsize = strlen(ctdb->recovery_lock) + 1; + } + return 0; + case CTDB_CONTROL_SET_RECLOCK_FILE: + return control_not_implemented("SET_RECLOCK", NULL); + + case CTDB_CONTROL_STOP_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_stop_node(ctdb); + + case CTDB_CONTROL_CONTINUE_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_continue_node(ctdb); + + case CTDB_CONTROL_SET_NATGWSTATE: + return control_not_implemented("SET_NATGWSTATE", NULL); + + case CTDB_CONTROL_SET_LMASTERROLE: { + uint32_t lmasterrole; + + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + lmasterrole = *(uint32_t *)indata.dptr; + if (lmasterrole == 0) { + ctdb->capabilities &= ~CTDB_CAP_LMASTER; + } else { + ctdb->capabilities |= CTDB_CAP_LMASTER; + } + return 0; + } + + case CTDB_CONTROL_SET_RECMASTERROLE: { + uint32_t recmasterrole; + + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + recmasterrole = *(uint32_t *)indata.dptr; + if (recmasterrole == 0) { + ctdb->capabilities &= ~CTDB_CAP_RECMASTER; + } else { + ctdb->capabilities |= CTDB_CAP_RECMASTER; + } + return 0; + } + + case CTDB_CONTROL_ENABLE_SCRIPT: + return control_not_implemented("ENABLE_SCRIPT", NULL); + + case CTDB_CONTROL_DISABLE_SCRIPT: + return control_not_implemented("DISABLE_SCRIPT", NULL); + + case CTDB_CONTROL_SET_BAN_STATE: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_ban_state)); + return ctdb_control_set_ban_state(ctdb, indata); + + case CTDB_CONTROL_GET_BAN_STATE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_get_ban_state(ctdb, outdata); + + case CTDB_CONTROL_SET_DB_PRIORITY: + return control_not_implemented("SET_DB_PRIORITY", NULL); + + case CTDB_CONTROL_GET_DB_PRIORITY: + return control_not_implemented("GET_DB_PRIORITY", NULL); + + case CTDB_CONTROL_TRANSACTION_CANCEL: + return 
control_not_implemented("TRANSACTION_CANCEL", NULL); + + case CTDB_CONTROL_REGISTER_NOTIFY: + return ctdb_control_register_notify(ctdb, client_id, indata); + + case CTDB_CONTROL_DEREGISTER_NOTIFY: + CHECK_CONTROL_DATA_SIZE(sizeof(uint64_t)); + return ctdb_control_deregister_notify(ctdb, client_id, indata); + + case CTDB_CONTROL_GET_LOG: + return control_not_implemented("GET_LOG", NULL); + + case CTDB_CONTROL_CLEAR_LOG: + return control_not_implemented("CLEAR_LOG", NULL); + + case CTDB_CONTROL_GET_DB_SEQNUM: + CHECK_CONTROL_DATA_SIZE(sizeof(uint64_t)); + return ctdb_control_get_db_seqnum(ctdb, indata, outdata); + + case CTDB_CONTROL_DB_SET_HEALTHY: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_db_set_healthy(ctdb, indata); + + case CTDB_CONTROL_DB_GET_HEALTH: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_db_get_health(ctdb, indata, outdata); + + case CTDB_CONTROL_GET_PUBLIC_IP_INFO: + CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr)); + return ctdb_control_get_public_ip_info(ctdb, c, indata, outdata); + + case CTDB_CONTROL_GET_IFACES: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_get_ifaces(ctdb, c, outdata); + + case CTDB_CONTROL_SET_IFACE_LINK_STATE: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_iface)); + return ctdb_control_set_iface_link(ctdb, c, indata); + + case CTDB_CONTROL_GET_STAT_HISTORY: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_get_stat_history(ctdb, c, outdata); + + case CTDB_CONTROL_SCHEDULE_FOR_DELETION: { + struct ctdb_control_schedule_for_deletion *d; + size_t size = offsetof(struct ctdb_control_schedule_for_deletion, key); + CHECK_CONTROL_MIN_DATA_SIZE(size); + d = (struct ctdb_control_schedule_for_deletion *)indata.dptr; + size += d->keylen; + CHECK_CONTROL_DATA_SIZE(size); + return ctdb_control_schedule_for_deletion(ctdb, indata); + } + case CTDB_CONTROL_GET_DB_STATISTICS: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_get_db_statistics(ctdb, *(uint32_t *)indata.dptr, 
outdata); + + case CTDB_CONTROL_RELOAD_PUBLIC_IPS: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_reload_public_ips(ctdb, c, async_reply); + + case CTDB_CONTROL_RECEIVE_RECORDS: + return control_not_implemented("RECEIVE_RECORDS", NULL); + + case CTDB_CONTROL_DB_DETACH: + return ctdb_control_db_detach(ctdb, indata, client_id); + + case CTDB_CONTROL_DB_FREEZE: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_db_freeze(ctdb, c, *(uint32_t *)indata.dptr, + async_reply); + + case CTDB_CONTROL_DB_THAW: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_db_thaw(ctdb, *(uint32_t *)indata.dptr); + + case CTDB_CONTROL_DB_TRANSACTION_START: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb)); + return ctdb_control_db_transaction_start(ctdb, indata); + + case CTDB_CONTROL_DB_TRANSACTION_COMMIT: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_transdb)); + return ctdb_control_db_transaction_commit(ctdb, indata); + + case CTDB_CONTROL_DB_TRANSACTION_CANCEL: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_db_transaction_cancel(ctdb, indata); + + case CTDB_CONTROL_DB_PULL: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_pulldb_ext)); + return ctdb_control_db_pull(ctdb, c, indata, outdata); + + case CTDB_CONTROL_DB_PUSH_START: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_pulldb_ext)); + return ctdb_control_db_push_start(ctdb, indata); + + case CTDB_CONTROL_DB_PUSH_CONFIRM: + CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t)); + return ctdb_control_db_push_confirm(ctdb, indata, outdata); + + case CTDB_CONTROL_DB_OPEN_FLAGS: { + uint32_t db_id; + struct ctdb_db_context *ctdb_db; + int tdb_flags; + + CHECK_CONTROL_DATA_SIZE(sizeof(db_id)); + db_id = *(uint32_t *)indata.dptr; + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) { + return -1; + } + + tdb_flags = tdb_get_flags(ctdb_db->ltdb->tdb); + + outdata->dptr = talloc_size(outdata, sizeof(tdb_flags)); + if (outdata->dptr == NULL) { + return -1; + } + + outdata->dsize = 
sizeof(tdb_flags); + memcpy(outdata->dptr, &tdb_flags, outdata->dsize); + return 0; + } + + case CTDB_CONTROL_CHECK_PID_SRVID: + CHECK_CONTROL_DATA_SIZE((sizeof(pid_t) + sizeof(uint64_t))); + return ctdb_control_check_pid_srvid(ctdb, indata); + + case CTDB_CONTROL_TUNNEL_REGISTER: + return ctdb_control_tunnel_register(ctdb, client_id, srvid); + + case CTDB_CONTROL_TUNNEL_DEREGISTER: + return ctdb_control_tunnel_deregister(ctdb, client_id, srvid); + + case CTDB_CONTROL_VACUUM_FETCH: + return ctdb_control_vacuum_fetch(ctdb, indata); + + case CTDB_CONTROL_DB_VACUUM: { + struct ctdb_db_vacuum db_vacuum; + + CHECK_CONTROL_DATA_SIZE(ctdb_db_vacuum_len(&db_vacuum)); + return ctdb_control_db_vacuum(ctdb, c, indata, async_reply); + } + case CTDB_CONTROL_ECHO_DATA: { + return ctdb_control_echo_data(ctdb, c, indata, async_reply); + } + + case CTDB_CONTROL_DISABLE_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_disable_node(ctdb); + + case CTDB_CONTROL_ENABLE_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_enable_node(ctdb); + + case CTDB_CONTROL_TCP_CLIENT_DISCONNECTED: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection)); + return ctdb_control_tcp_client_disconnected(ctdb, client_id, indata); + + case CTDB_CONTROL_TCP_CLIENT_PASSED: + CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_connection)); + return ctdb_control_tcp_client_passed(ctdb, client_id, indata); + + default: + DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode)); + return -1; + } +} + +/* + send a reply for a ctdb control + */ +void ctdb_request_control_reply(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, + TDB_DATA *outdata, int32_t status, const char *errormsg) +{ + struct ctdb_reply_control_old *r; + size_t len; + + /* some controls send no reply */ + if (c->flags & CTDB_CTRL_FLAG_NOREPLY) { + return; + } + + len = offsetof(struct ctdb_reply_control_old, data) + (outdata?outdata->dsize:0); + if (errormsg) { + len += strlen(errormsg); + } + r = 
ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CONTROL, len, struct ctdb_reply_control_old); + if (r == NULL) { + DEBUG(DEBUG_ERR,(__location__ "Unable to allocate transport - OOM or transport is down\n")); + return; + } + + r->hdr.destnode = c->hdr.srcnode; + r->hdr.reqid = c->hdr.reqid; + r->status = status; + r->datalen = outdata?outdata->dsize:0; + if (outdata && outdata->dsize) { + memcpy(&r->data[0], outdata->dptr, outdata->dsize); + } + if (errormsg) { + r->errorlen = strlen(errormsg); + memcpy(&r->data[r->datalen], errormsg, r->errorlen); + } + + ctdb_queue_packet_opcode(ctdb, &r->hdr, c->opcode); + + talloc_free(r); +} + +/* + called when a CTDB_REQ_CONTROL packet comes in +*/ +void ctdb_request_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_req_control_old *c = (struct ctdb_req_control_old *)hdr; + TDB_DATA data, *outdata; + int32_t status; + bool async_reply = false; + const char *errormsg = NULL; + + data.dptr = &c->data[0]; + data.dsize = c->datalen; + + outdata = talloc_zero(c, TDB_DATA); + + status = ctdb_control_dispatch(ctdb, c, data, outdata, hdr->srcnode, + &errormsg, &async_reply); + + if (!async_reply) { + ctdb_request_control_reply(ctdb, c, outdata, status, errormsg); + } +} + +/* + called when a CTDB_REPLY_CONTROL packet comes in +*/ +void ctdb_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_reply_control_old *c = (struct ctdb_reply_control_old *)hdr; + TDB_DATA data; + struct ctdb_control_state *state; + const char *errormsg = NULL; + + state = reqid_find(ctdb->idr, hdr->reqid, struct ctdb_control_state); + if (state == NULL) { + DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_control\n", + ctdb->pnn, hdr->reqid)); + return; + } + + if (hdr->reqid != state->reqid) { + /* we found a record but it was the wrong one */ + DEBUG(DEBUG_ERR, ("Dropped orphaned control reply with reqid:%u\n", hdr->reqid)); + return; + } + + data.dptr = &c->data[0]; + data.dsize = 
c->datalen; + if (c->errorlen) { + errormsg = talloc_strndup(state, + (char *)&c->data[c->datalen], c->errorlen); + } + + /* make state a child of the packet, so it goes away when the packet + is freed. */ + talloc_steal(hdr, state); + + state->callback(ctdb, c->status, data, errormsg, state->private_data); +} + +static int ctdb_control_destructor(struct ctdb_control_state *state) +{ + reqid_remove(state->ctdb->idr, state->reqid); + return 0; +} + +/* + handle a timeout of a control + */ +static void ctdb_control_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_control_state *state = talloc_get_type(private_data, struct ctdb_control_state); + TALLOC_CTX *tmp_ctx = talloc_new(ev); + + CTDB_INCREMENT_STAT(state->ctdb, timeouts.control); + + talloc_steal(tmp_ctx, state); + + state->callback(state->ctdb, -1, tdb_null, + "ctdb_control timed out", + state->private_data); + talloc_free(tmp_ctx); +} + + +/* + send a control message to a node + */ +int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode, + uint64_t srvid, uint32_t opcode, uint32_t client_id, + uint32_t flags, + TDB_DATA data, + ctdb_control_callback_fn_t callback, + void *private_data) +{ + struct ctdb_req_control_old *c; + struct ctdb_control_state *state; + size_t len; + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_INFO,(__location__ " Failed to send control. 
Transport is DOWN\n")); + return -1; + } + + if (((destnode == CTDB_BROADCAST_ACTIVE) || + (destnode == CTDB_BROADCAST_ALL) || + (destnode == CTDB_BROADCAST_CONNECTED)) && + !(flags & CTDB_CTRL_FLAG_NOREPLY)) { + DEBUG(DEBUG_CRIT,("Attempt to broadcast control without NOREPLY\n")); + return -1; + } + + if (destnode != CTDB_BROADCAST_ACTIVE && + destnode != CTDB_BROADCAST_ALL && + destnode != CTDB_BROADCAST_CONNECTED && + (!ctdb_validate_pnn(ctdb, destnode) || + (ctdb->nodes[destnode]->flags & NODE_FLAGS_DISCONNECTED))) { + if (!(flags & CTDB_CTRL_FLAG_NOREPLY)) { + callback(ctdb, -1, tdb_null, "ctdb_control to disconnected node", private_data); + } + return 0; + } + + /* the state is made a child of private_data if possible. This means any reply + will be discarded if the private_data goes away */ + state = talloc(private_data?private_data:ctdb, struct ctdb_control_state); + CTDB_NO_MEMORY(ctdb, state); + + state->reqid = reqid_new(ctdb->idr, state); + state->callback = callback; + state->private_data = private_data; + state->ctdb = ctdb; + state->flags = flags; + + talloc_set_destructor(state, ctdb_control_destructor); + + len = offsetof(struct ctdb_req_control_old, data) + data.dsize; + c = ctdb_transport_allocate(ctdb, state, CTDB_REQ_CONTROL, len, + struct ctdb_req_control_old); + CTDB_NO_MEMORY(ctdb, c); + talloc_set_name_const(c, "ctdb_req_control packet"); + + c->hdr.destnode = destnode; + c->hdr.reqid = state->reqid; + c->opcode = opcode; + c->client_id = client_id; + c->flags = flags; + c->srvid = srvid; + c->datalen = data.dsize; + if (data.dsize) { + memcpy(&c->data[0], data.dptr, data.dsize); + } + + ctdb_queue_packet(ctdb, &c->hdr); + + if (flags & CTDB_CTRL_FLAG_NOREPLY) { + talloc_free(state); + return 0; + } + + if (ctdb->tunable.control_timeout) { + tevent_add_timer(ctdb->ev, state, + timeval_current_ofs(ctdb->tunable.control_timeout, 0), + ctdb_control_timeout, state); + } + + talloc_free(c); + return 0; +} diff --git a/ctdb/server/ctdb_daemon.c 
b/ctdb/server/ctdb_daemon.c new file mode 100644 index 0000000..eb9d634 --- /dev/null +++ b/ctdb/server/ctdb_daemon.c @@ -0,0 +1,2248 @@ +/* + ctdb daemon code + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/wait.h" +#include "system/time.h" + +#include <talloc.h> +/* Allow use of deprecated function tevent_loop_allow_nesting() */ +#define TEVENT_DEPRECATED +#include <tevent.h> +#include <tdb.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/time.h" +#include "lib/util/blocking.h" +#include "lib/util/become_daemon.h" + +#include "version.h" +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/rb_tree.h" +#include "common/reqid.h" +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" +#include "common/pidfile.h" +#include "common/sock_io.h" + +struct ctdb_client_pid_list { + struct ctdb_client_pid_list *next, *prev; + struct ctdb_context *ctdb; + pid_t pid; + struct ctdb_client *client; +}; + +const char *ctdbd_pidfile = NULL; +static struct pidfile_context *ctdbd_pidfile_ctx = NULL; + +static void daemon_incoming_packet(void *, struct ctdb_req_header *); + +static pid_t __ctdbd_pid; + +static void print_exit_message(void) +{ + 
if (getpid() == __ctdbd_pid) { + DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n")); + + /* Wait a second to allow pending log messages to be flushed */ + sleep(1); + } +} + +#ifdef HAVE_GETRUSAGE + +struct cpu_check_threshold_data { + unsigned short percent; + struct timeval timeofday; + struct timeval ru_time; +}; + +static void ctdb_cpu_check_threshold(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval tv, + void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type_abort( + private_data, struct ctdb_context); + uint32_t interval = 60; + + static unsigned short threshold = 0; + static struct cpu_check_threshold_data prev = { + .percent = 0, + .timeofday = { .tv_sec = 0 }, + .ru_time = { .tv_sec = 0 }, + }; + + struct rusage usage; + struct cpu_check_threshold_data curr = { + .percent = 0, + }; + int64_t ru_time_diff, timeofday_diff; + bool first; + int ret; + + /* + * Cache the threshold so that we don't waste time checking + * the environment variable every time + */ + if (threshold == 0) { + const char *t; + + threshold = 90; + + t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD"); + if (t != NULL) { + int th; + + th = atoi(t); + if (th <= 0 || th > 100) { + DBG_WARNING("Failed to parse env var: %s\n", t); + } else { + threshold = th; + } + } + } + + ret = getrusage(RUSAGE_SELF, &usage); + if (ret != 0) { + DBG_WARNING("rusage() failed: %d\n", ret); + goto next; + } + + /* Sum the system and user CPU usage */ + curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime); + + curr.timeofday = tv; + + first = timeval_is_zero(&prev.timeofday); + if (first) { + /* No previous values recorded so no calculation to do */ + goto done; + } + + timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday); + if (timeofday_diff <= 0) { + /* + * Time went backwards or didn't progress so no (sane) + * calculation can be done + */ + goto done; + } + + ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time); + + curr.percent = ru_time_diff * 
100 / timeofday_diff; + + if (curr.percent >= threshold) { + /* Log only if the utilisation changes */ + if (curr.percent != prev.percent) { + D_WARNING("WARNING: CPU utilisation %hu%% >= " + "threshold (%hu%%)\n", + curr.percent, + threshold); + } + } else { + /* Log if the utilisation falls below the threshold */ + if (prev.percent >= threshold) { + D_WARNING("WARNING: CPU utilisation %hu%% < " + "threshold (%hu%%)\n", + curr.percent, + threshold); + } + } + +done: + prev = curr; + +next: + tevent_add_timer(ctdb->ev, ctdb, + timeval_current_ofs(interval, 0), + ctdb_cpu_check_threshold, + ctdb); +} + +static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb) +{ + tevent_add_timer(ctdb->ev, ctdb, + timeval_current(), + ctdb_cpu_check_threshold, + ctdb); +} +#endif /* HAVE_GETRUSAGE */ + +static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + + if (getpid() != ctdb->ctdbd_pid) { + return; + } + + tevent_add_timer(ctdb->ev, ctdb, + timeval_current_ofs(1, 0), + ctdb_time_tick, ctdb); +} + +/* Used to trigger a dummy event once per second, to make + * detection of hangs more reliable. 
+ */ +static void ctdb_start_time_tickd(struct ctdb_context *ctdb) +{ + tevent_add_timer(ctdb->ev, ctdb, + timeval_current_ofs(1, 0), + ctdb_time_tick, ctdb); +} + +static void ctdb_start_periodic_events(struct ctdb_context *ctdb) +{ + /* start monitoring for connected/disconnected nodes */ + ctdb_start_keepalive(ctdb); + + /* start periodic update of tcp tickle lists */ + ctdb_start_tcp_tickle_update(ctdb); + + /* start listening for recovery daemon pings */ + ctdb_control_recd_ping(ctdb); + + /* start listening to timer ticks */ + ctdb_start_time_tickd(ctdb); + +#ifdef HAVE_GETRUSAGE + ctdb_start_cpu_check_threshold(ctdb); +#endif /* HAVE_GETRUSAGE */ +} + +static void ignore_signal(int signum) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + + act.sa_handler = SIG_IGN; + sigemptyset(&act.sa_mask); + sigaddset(&act.sa_mask, signum); + sigaction(signum, &act, NULL); +} + + +/* + send a packet to a client + */ +static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr) +{ + CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent); + if (hdr->operation == CTDB_REQ_MESSAGE) { + if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) { + DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n")); + talloc_free(client); + return -1; + } + } + return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length); +} + +/* + message handler for when we are in daemon mode. 
This redirects the message + to the right client + */ +static void daemon_message_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client); + struct ctdb_req_message_old *r; + int len; + + /* construct a message to send to the client containing the data */ + len = offsetof(struct ctdb_req_message_old, data) + data.dsize; + r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE, + len, struct ctdb_req_message_old); + CTDB_NO_MEMORY_VOID(client->ctdb, r); + + talloc_set_name_const(r, "req_message packet"); + + r->srvid = srvid; + r->datalen = data.dsize; + memcpy(&r->data[0], data.dptr, data.dsize); + + daemon_queue_send(client, &r->hdr); + + talloc_free(r); +} + +/* + this is called when the ctdb daemon received a ctdb request to + set the srvid from the client + */ +int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid) +{ + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + int res; + if (client == NULL) { + DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n")); + return -1; + } + res = srvid_register(ctdb->srv, client, srvid, daemon_message_handler, + client); + if (res != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n", + (unsigned long long)srvid)); + } else { + DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n", + (unsigned long long)srvid)); + } + + return res; +} + +/* + this is called when the ctdb daemon received a ctdb request to + remove a srvid from the client + */ +int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid) +{ + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + if (client == NULL) { + DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n")); + return -1; + } + return 
srvid_deregister(ctdb->srv, srvid, client); +} + +void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data, + void *private_data) +{ + struct ctdb_client *client = + talloc_get_type_abort(private_data, struct ctdb_client); + struct ctdb_req_tunnel_old *c, *pkt; + size_t len; + + pkt = (struct ctdb_req_tunnel_old *)data.dptr; + + len = offsetof(struct ctdb_req_tunnel_old, data) + pkt->datalen; + c = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_TUNNEL, + len, struct ctdb_req_tunnel_old); + if (c == NULL) { + DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n")); + return; + } + + talloc_set_name_const(c, "req_tunnel packet"); + + c->tunnel_id = tunnel_id; + c->flags = pkt->flags; + c->datalen = pkt->datalen; + memcpy(c->data, pkt->data, pkt->datalen); + + daemon_queue_send(client, &c->hdr); + + talloc_free(c); +} + +/* + destroy a ctdb_client +*/ +static int ctdb_client_destructor(struct ctdb_client *client) +{ + struct ctdb_db_context *ctdb_db; + + ctdb_takeover_client_destructor_hook(client); + reqid_remove(client->ctdb->idr, client->client_id); + client->ctdb->num_clients--; + + if (client->num_persistent_updates != 0) { + DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates)); + client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + } + ctdb_db = find_ctdb_db(client->ctdb, client->db_id); + if (ctdb_db) { + DEBUG(DEBUG_ERR, (__location__ " client exit while transaction " + "commit active. Forcing recovery.\n")); + client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + + /* + * trans3 transaction state: + * + * The destructor sets the pointer to NULL. 
+ */ + talloc_free(ctdb_db->persistent_state); + } + + return 0; +} + + +/* + this is called when the ctdb daemon received a ctdb request message + from a local client over the unix domain socket + */ +static void daemon_request_message_from_client(struct ctdb_client *client, + struct ctdb_req_message_old *c) +{ + TDB_DATA data; + int res; + + if (c->hdr.destnode == CTDB_CURRENT_NODE) { + c->hdr.destnode = ctdb_get_pnn(client->ctdb); + } + + /* maybe the message is for another client on this node */ + if (ctdb_get_pnn(client->ctdb)==c->hdr.destnode) { + ctdb_request_message(client->ctdb, (struct ctdb_req_header *)c); + return; + } + + /* its for a remote node */ + data.dptr = &c->data[0]; + data.dsize = c->datalen; + res = ctdb_daemon_send_message(client->ctdb, c->hdr.destnode, + c->srvid, data); + if (res != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n", + c->hdr.destnode)); + } +} + + +struct daemon_call_state { + struct ctdb_client *client; + uint32_t reqid; + struct ctdb_call *call; + struct timeval start_time; + + /* readonly request ? 
*/ + uint32_t readonly_fetch; + uint32_t client_callid; +}; + +/* + complete a call from a client +*/ +static void daemon_call_from_client_callback(struct ctdb_call_state *state) +{ + struct daemon_call_state *dstate = talloc_get_type(state->async.private_data, + struct daemon_call_state); + struct ctdb_reply_call_old *r; + int res; + uint32_t length; + struct ctdb_client *client = dstate->client; + struct ctdb_db_context *ctdb_db = state->ctdb_db; + + talloc_steal(client, dstate); + talloc_steal(dstate, dstate->call); + + res = ctdb_daemon_call_recv(state, dstate->call); + if (res != 0) { + DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n")); + CTDB_DECREMENT_STAT(client->ctdb, pending_calls); + + CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time); + return; + } + + length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize; + /* If the client asked for readonly FETCH, we remapped this to + FETCH_WITH_HEADER when calling the daemon. So we must + strip the extra header off the reply data before passing + it back to the client. 
+ */ + if (dstate->readonly_fetch + && dstate->client_callid == CTDB_FETCH_FUNC) { + length -= sizeof(struct ctdb_ltdb_header); + } + + r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL, + length, struct ctdb_reply_call_old); + if (r == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n")); + CTDB_DECREMENT_STAT(client->ctdb, pending_calls); + CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time); + return; + } + r->hdr.reqid = dstate->reqid; + r->status = dstate->call->status; + + if (dstate->readonly_fetch + && dstate->client_callid == CTDB_FETCH_FUNC) { + /* client only asked for a FETCH so we must strip off + the extra ctdb_ltdb header + */ + r->datalen = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header); + memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen); + } else { + r->datalen = dstate->call->reply_data.dsize; + memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen); + } + + res = daemon_queue_send(client, &r->hdr); + if (res == -1) { + /* client is dead - return immediately */ + return; + } + if (res != 0) { + DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n")); + } + CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time); + CTDB_DECREMENT_STAT(client->ctdb, pending_calls); + talloc_free(dstate); +} + +struct ctdb_daemon_packet_wrap { + struct ctdb_context *ctdb; + uint32_t client_id; +}; + +/* + a wrapper to catch disconnected clients + */ +static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr) +{ + struct ctdb_client *client; + struct ctdb_daemon_packet_wrap *w = talloc_get_type(p, + struct ctdb_daemon_packet_wrap); + if (w == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p))); + return; + } + + client = reqid_find(w->ctdb->idr, w->client_id, struct 
ctdb_client); + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n", + w->client_id)); + talloc_free(w); + return; + } + talloc_free(w); + + /* process it */ + daemon_incoming_packet(client, hdr); +} + +struct ctdb_deferred_fetch_call { + struct ctdb_deferred_fetch_call *next, *prev; + struct ctdb_req_call_old *c; + struct ctdb_daemon_packet_wrap *w; +}; + +struct ctdb_deferred_fetch_queue { + struct ctdb_deferred_fetch_call *deferred_calls; +}; + +struct ctdb_deferred_requeue { + struct ctdb_deferred_fetch_call *dfc; + struct ctdb_client *client; +}; + +/* called from a timer event and starts reprocessing the deferred call.*/ +static void reprocess_deferred_call(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data; + struct ctdb_client *client = dfr->client; + + talloc_steal(client, dfr->dfc->c); + daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c); + talloc_free(dfr); +} + +/* the referral context is destroyed either after a timeout or when the initial + fetch-lock has finished. + at this stage, immediately start reprocessing the queued up deferred + calls so they get reprocessed immediately (and since we are dmaster at + this stage, trigger the waiting smbd processes to pick up and acquire the + record right away. 
+*/ +static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq) +{ + + /* need to reprocess the packets from the queue explicitly instead of + just using a normal destructor since we need to + call the clients in the same order as the requests queued up + */ + while (dfq->deferred_calls != NULL) { + struct ctdb_client *client; + struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls; + struct ctdb_deferred_requeue *dfr; + + DLIST_REMOVE(dfq->deferred_calls, dfc); + + client = reqid_find(dfc->w->ctdb->idr, dfc->w->client_id, struct ctdb_client); + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n", + dfc->w->client_id)); + continue; + } + + /* process it by pushing it back onto the eventloop */ + dfr = talloc(client, struct ctdb_deferred_requeue); + if (dfr == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n")); + continue; + } + + dfr->dfc = talloc_steal(dfr, dfc); + dfr->client = client; + + tevent_add_timer(dfc->w->ctdb->ev, client, timeval_zero(), + reprocess_deferred_call, dfr); + } + + return 0; +} + +/* insert the new deferral context into the rb tree. + there should never be a pre-existing context here, but check for it + warn and destroy the previous context if there is already a deferral context + for this key. +*/ +static void *insert_dfq_callback(void *parm, void *data) +{ + if (data) { + DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm)); + talloc_free(data); + } + return parm; +} + +/* if the original fetch-lock did not complete within a reasonable time, + free the context and context for all deferred requests to cause them to be + re-inserted into the event system. 
+*/ +static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te, + struct timeval t, void *private_data) +{ + talloc_free(private_data); +} + +/* This function is used in the local daemon to register a KEY in a database + for being "fetched" + While the remote fetch is in-flight, any further attempts to re-fetch the + same record will be deferred until the fetch completes. +*/ +static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call) +{ + uint32_t *k; + struct ctdb_deferred_fetch_queue *dfq; + + k = ctdb_key_to_idkey(call, call->key); + if (k == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n")); + return -1; + } + + dfq = talloc(call, struct ctdb_deferred_fetch_queue); + if (dfq == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n")); + talloc_free(k); + return -1; + } + dfq->deferred_calls = NULL; + + trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq); + + talloc_set_destructor(dfq, deferred_fetch_queue_destructor); + + /* If the fetch hasn't completed in 30 seconds, just tear it all down + and let it try again as the events are reissued */ + tevent_add_timer(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0), + dfq_timeout, dfq); + + talloc_free(k); + return 0; +} + +/* check if this is a duplicate request to a fetch already in-flight + if it is, make this call deferred to be reprocessed later when + the in-flight fetch completes. 
+*/ +static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c) +{ + uint32_t *k; + struct ctdb_deferred_fetch_queue *dfq; + struct ctdb_deferred_fetch_call *dfc; + + k = ctdb_key_to_idkey(c, key); + if (k == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n")); + return -1; + } + + dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]); + if (dfq == NULL) { + talloc_free(k); + return -1; + } + + + talloc_free(k); + + dfc = talloc(dfq, struct ctdb_deferred_fetch_call); + if (dfc == NULL) { + DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n")); + return -1; + } + + dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap); + if (dfc->w == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n")); + talloc_free(dfc); + return -1; + } + + dfc->c = talloc_steal(dfc, c); + dfc->w->ctdb = ctdb_db->ctdb; + dfc->w->client_id = client->client_id; + + DLIST_ADD_END(dfq->deferred_calls, dfc); + + return 0; +} + + +/* + this is called when the ctdb daemon received a ctdb request call + from a local client over the unix domain socket + */ +static void daemon_request_call_from_client(struct ctdb_client *client, + struct ctdb_req_call_old *c) +{ + struct ctdb_call_state *state; + struct ctdb_db_context *ctdb_db; + struct daemon_call_state *dstate; + struct ctdb_call *call; + struct ctdb_ltdb_header header; + TDB_DATA key, data; + int ret; + struct ctdb_context *ctdb = client->ctdb; + struct ctdb_daemon_packet_wrap *w; + + CTDB_INCREMENT_STAT(ctdb, total_calls); + CTDB_INCREMENT_STAT(ctdb, pending_calls); + + ctdb_db = find_ctdb_db(client->ctdb, c->db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. 
db_id==0x%08x\n", + c->db_id)); + CTDB_DECREMENT_STAT(ctdb, pending_calls); + return; + } + + if (ctdb_db->unhealthy_reason) { + /* + * this is just a warning, as the tdb should be empty anyway, + * and only persistent databases can be unhealthy, which doesn't + * use this code patch + */ + DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + } + + key.dptr = c->data; + key.dsize = c->keylen; + + w = talloc(ctdb, struct ctdb_daemon_packet_wrap); + CTDB_NO_MEMORY_VOID(ctdb, w); + + w->ctdb = ctdb; + w->client_id = client->client_id; + + ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, + (struct ctdb_req_header *)c, &data, + daemon_incoming_packet_wrap, w, true); + if (ret == -2) { + /* will retry later */ + CTDB_DECREMENT_STAT(ctdb, pending_calls); + return; + } + + talloc_free(w); + + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n")); + CTDB_DECREMENT_STAT(ctdb, pending_calls); + return; + } + + + /* check if this fetch request is a duplicate for a + request we already have in flight. If so defer it until + the first request completes. 
+ */ + if (ctdb->tunable.fetch_collapse == 1) { + if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) { + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + CTDB_DECREMENT_STAT(ctdb, pending_calls); + talloc_free(data.dptr); + return; + } + } + + /* Dont do READONLY if we don't have a tracking database */ + if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) { + c->flags &= ~CTDB_WANT_READONLY; + } + + if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) { + header.flags &= ~CTDB_REC_RO_FLAGS; + CTDB_INCREMENT_STAT(ctdb, total_ro_revokes); + CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes); + if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) { + ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag"); + } + /* and clear out the tracking data */ + if (tdb_delete(ctdb_db->rottdb, key) != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n")); + } + } + + /* if we are revoking, we must defer all other calls until the revoke + * had completed. 
+ */ + if (header.flags & CTDB_REC_RO_REVOKING_READONLY) { + talloc_free(data.dptr); + ret = ctdb_ltdb_unlock(ctdb_db, key); + + if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) { + ctdb_fatal(ctdb, "Failed to add deferred call for revoke child"); + } + CTDB_DECREMENT_STAT(ctdb, pending_calls); + return; + } + + if ((header.dmaster == ctdb->pnn) + && (!(c->flags & CTDB_WANT_READONLY)) + && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) { + header.flags |= CTDB_REC_RO_REVOKING_READONLY; + if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) { + ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set"); + } + ret = ctdb_ltdb_unlock(ctdb_db, key); + + if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) { + ctdb_fatal(ctdb, "Failed to start record revoke"); + } + talloc_free(data.dptr); + + if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) { + ctdb_fatal(ctdb, "Failed to add deferred call for revoke child"); + } + + CTDB_DECREMENT_STAT(ctdb, pending_calls); + return; + } + + dstate = talloc(client, struct daemon_call_state); + if (dstate == NULL) { + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + + DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n")); + CTDB_DECREMENT_STAT(ctdb, pending_calls); + return; + } + dstate->start_time = timeval_current(); + dstate->client = client; + dstate->reqid = c->hdr.reqid; + talloc_steal(dstate, data.dptr); + + call = dstate->call = talloc_zero(dstate, struct ctdb_call); + if (call == NULL) { + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + + DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n")); + 
CTDB_DECREMENT_STAT(ctdb, pending_calls); + CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time); + return; + } + + dstate->readonly_fetch = 0; + call->call_id = c->callid; + call->key = key; + call->call_data.dptr = c->data + c->keylen; + call->call_data.dsize = c->calldatalen; + call->flags = c->flags; + + if (c->flags & CTDB_WANT_READONLY) { + /* client wants readonly record, so translate this into a + fetch with header. remember what the client asked for + so we can remap the reply back to the proper format for + the client in the reply + */ + dstate->client_callid = call->call_id; + call->call_id = CTDB_FETCH_WITH_HEADER_FUNC; + dstate->readonly_fetch = 1; + } + + if (header.dmaster == ctdb->pnn) { + state = ctdb_call_local_send(ctdb_db, call, &header, &data); + } else { + state = ctdb_daemon_call_send_remote(ctdb_db, call, &header); + if (ctdb->tunable.fetch_collapse == 1) { + /* This request triggered a remote fetch-lock. + set up a deferral for this key so any additional + fetch-locks are deferred until the current one + finishes. 
+ */ + setup_deferred_fetch_locks(ctdb_db, call); + } + } + + ret = ctdb_ltdb_unlock(ctdb_db, key); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret)); + } + + if (state == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n")); + CTDB_DECREMENT_STAT(ctdb, pending_calls); + CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time); + return; + } + talloc_steal(state, dstate); + talloc_steal(client, state); + + state->async.fn = daemon_call_from_client_callback; + state->async.private_data = dstate; +} + + +static void daemon_request_control_from_client(struct ctdb_client *client, + struct ctdb_req_control_old *c); +static void daemon_request_tunnel_from_client(struct ctdb_client *client, + struct ctdb_req_tunnel_old *c); + +/* data contains a packet from the client */ +static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr) +{ + struct ctdb_client *client = talloc_get_type(p, struct ctdb_client); + TALLOC_CTX *tmp_ctx; + struct ctdb_context *ctdb = client->ctdb; + + /* place the packet as a child of a tmp_ctx. We then use + talloc_free() below to free it. 
If any of the calls want + to keep it, then they will steal it somewhere else, and the + talloc_free() will be a no-op */ + tmp_ctx = talloc_new(client); + talloc_steal(tmp_ctx, hdr); + + if (hdr->ctdb_magic != CTDB_MAGIC) { + ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n"); + goto done; + } + + if (hdr->ctdb_version != CTDB_PROTOCOL) { + ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version); + goto done; + } + + switch (hdr->operation) { + case CTDB_REQ_CALL: + CTDB_INCREMENT_STAT(ctdb, client.req_call); + daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr); + break; + + case CTDB_REQ_MESSAGE: + CTDB_INCREMENT_STAT(ctdb, client.req_message); + daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr); + break; + + case CTDB_REQ_CONTROL: + CTDB_INCREMENT_STAT(ctdb, client.req_control); + daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr); + break; + + case CTDB_REQ_TUNNEL: + CTDB_INCREMENT_STAT(ctdb, client.req_tunnel); + daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr); + break; + + default: + DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n", + hdr->operation)); + } + +done: + talloc_free(tmp_ctx); +} + +/* + called when the daemon gets a incoming packet + */ +static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args) +{ + struct ctdb_client *client = talloc_get_type(args, struct ctdb_client); + struct ctdb_req_header *hdr; + + if (cnt == 0) { + talloc_free(client); + return; + } + + CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv); + + if (cnt < sizeof(*hdr)) { + ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n", + (unsigned)cnt); + return; + } + hdr = (struct ctdb_req_header *)data; + + if (hdr->ctdb_magic != CTDB_MAGIC) { + ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n"); + goto err_out; + } + + if (hdr->ctdb_version != CTDB_PROTOCOL) { + 
ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version); + goto err_out; + } + + DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from " + "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length, + hdr->srcnode, hdr->destnode)); + + /* it is the responsibility of the incoming packet function to free 'data' */ + daemon_incoming_packet(client, hdr); + return; + +err_out: + TALLOC_FREE(data); +} + + +static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid) +{ + if (client_pid->ctdb->client_pids != NULL) { + DLIST_REMOVE(client_pid->ctdb->client_pids, client_pid); + } + + return 0; +} + +static int get_new_client_id(struct reqid_context *idr, + struct ctdb_client *client, + uint32_t *out) +{ + uint32_t client_id; + + client_id = reqid_new(idr, client); + /* + * Some places in the code (e.g. ctdb_control_db_attach(), + * ctdb_control_db_detach()) assign a special meaning to + * client_id 0. The assumption is that if client_id is 0 then + * the control has come from another daemon. Therefore, we + * should never return client_id == 0. + */ + if (client_id == 0) { + /* + * Don't leak ID 0. This is safe because the ID keeps + * increasing. A test will be added to ensure that + * this doesn't change. 
+ */ + reqid_remove(idr, 0); + + client_id = reqid_new(idr, client); + } + + if (client_id == REQID_INVALID) { + return EINVAL; + } + + if (client_id == 0) { + /* Every other ID must have been used and we can't use 0 */ + reqid_remove(idr, 0); + return EINVAL; + } + + *out = client_id; + return 0; +} + +static void ctdb_accept_client(struct tevent_context *ev, + struct tevent_fd *fde, uint16_t flags, + void *private_data) +{ + struct sockaddr_un addr; + socklen_t len; + int fd; + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + struct ctdb_client *client; + struct ctdb_client_pid_list *client_pid; + pid_t peer_pid = 0; + int ret; + + memset(&addr, 0, sizeof(addr)); + len = sizeof(addr); + fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len); + if (fd == -1) { + return; + } + smb_set_close_on_exec(fd); + + ret = set_blocking(fd, false); + if (ret != 0) { + DEBUG(DEBUG_ERR, + (__location__ + " failed to set socket non-blocking (%s)\n", + strerror(errno))); + close(fd); + return; + } + + set_close_on_exec(fd); + + DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd)); + + client = talloc_zero(ctdb, struct ctdb_client); + if (ctdb_get_peer_pid(fd, &peer_pid) == 0) { + DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid)); + } + + client->ctdb = ctdb; + client->fd = fd; + + ret = get_new_client_id(ctdb->idr, client, &client->client_id); + if (ret != 0) { + DBG_ERR("Unable to get client ID (%d)\n", ret); + close(fd); + talloc_free(client); + return; + } + + client->pid = peer_pid; + + client_pid = talloc(client, struct ctdb_client_pid_list); + if (client_pid == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n")); + close(fd); + talloc_free(client); + return; + } + client_pid->ctdb = ctdb; + client_pid->pid = peer_pid; + client_pid->client = client; + + DLIST_ADD(ctdb->client_pids, client_pid); + + client->queue = ctdb_queue_setup(ctdb, client, fd, 
CTDB_DS_ALIGNMENT, + ctdb_daemon_read_cb, client, + "client-%u", client->pid); + + talloc_set_destructor(client, ctdb_client_destructor); + talloc_set_destructor(client_pid, ctdb_clientpid_destructor); + ctdb->num_clients++; +} + + + +/* + * Create a unix domain socket, bind it, secure it and listen. Return + * the file descriptor for the socket. + */ +static int ux_socket_bind(struct ctdb_context *ctdb, bool test_mode_enabled) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + int ret; + + ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0); + if (ctdb->daemon.sd == -1) { + return -1; + } + + strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1); + + if (! sock_clean(ctdb->daemon.name)) { + return -1; + } + + set_close_on_exec(ctdb->daemon.sd); + + ret = set_blocking(ctdb->daemon.sd, false); + if (ret != 0) { + DBG_ERR("Failed to set socket non-blocking (%s)\n", + strerror(errno)); + goto failed; + } + + ret = bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)); + if (ret == -1) { + D_ERR("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name); + goto failed; + } + + if (!test_mode_enabled) { + ret = chown(ctdb->daemon.name, geteuid(), getegid()); + if (ret != 0 && !test_mode_enabled) { + D_ERR("Unable to secure (chown) ctdb socket '%s'\n", + ctdb->daemon.name); + goto failed; + } + } + + ret = chmod(ctdb->daemon.name, 0700); + if (ret != 0) { + D_ERR("Unable to secure (chmod) ctdb socket '%s'\n", + ctdb->daemon.name); + goto failed; + } + + + ret = listen(ctdb->daemon.sd, 100); + if (ret != 0) { + D_ERR("Unable to listen on ctdb socket '%s'\n", + ctdb->daemon.name); + goto failed; + } + + D_NOTICE("Listening to ctdb socket %s\n", ctdb->daemon.name); + return 0; + +failed: + close(ctdb->daemon.sd); + ctdb->daemon.sd = -1; + return -1; +} + +struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn) +{ + struct ctdb_node *node = NULL; + unsigned int i; + + if (pnn == CTDB_CURRENT_NODE) { + pnn = ctdb->pnn; + } + + /* 
Always found: PNN correctly set just before this is called */ + for (i = 0; i < ctdb->num_nodes; i++) { + node = ctdb->nodes[i]; + if (pnn == node->pnn) { + return node; + } + } + + return NULL; +} + +static void initialise_node_flags (struct ctdb_context *ctdb) +{ + struct ctdb_node *node = NULL; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + /* + * PNN correctly set just before this is called so always + * found but keep static analysers happy... + */ + if (node == NULL) { + DBG_ERR("Unable to find current node\n"); + return; + } + + node->flags &= ~NODE_FLAGS_DISCONNECTED; + + /* do we start out in DISABLED mode? */ + if (ctdb->start_as_disabled != 0) { + D_ERR("This node is configured to start in DISABLED state\n"); + node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED; + } + /* do we start out in STOPPED mode? */ + if (ctdb->start_as_stopped != 0) { + D_ERR("This node is configured to start in STOPPED state\n"); + node->flags |= NODE_FLAGS_STOPPED; + } +} + +static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status, + void *private_data) +{ + if (status != 0) { + ctdb_die(ctdb, "Failed to run setup event"); + } + ctdb_run_notification_script(ctdb, "setup"); + + /* Start the recovery daemon */ + if (ctdb_start_recoverd(ctdb) != 0) { + DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n")); + exit(11); + } + + ctdb_start_periodic_events(ctdb); + + ctdb_wait_for_first_recovery(ctdb); +} + +static struct timeval tevent_before_wait_ts; +static struct timeval tevent_after_wait_ts; + +static void ctdb_tevent_trace_init(void) +{ + struct timeval now; + + now = timeval_current(); + + tevent_before_wait_ts = now; + tevent_after_wait_ts = now; +} + +static void ctdb_tevent_trace(enum tevent_trace_point tp, + void *private_data) +{ + struct timeval diff; + struct timeval now; + struct ctdb_context *ctdb = + talloc_get_type(private_data, struct ctdb_context); + + if (getpid() != ctdb->ctdbd_pid) { + return; + } + + now = timeval_current(); + + switch 
(tp) { + case TEVENT_TRACE_BEFORE_WAIT: + diff = timeval_until(&tevent_after_wait_ts, &now); + if (diff.tv_sec > 3) { + DEBUG(DEBUG_ERR, + ("Handling event took %ld seconds!\n", + (long)diff.tv_sec)); + } + tevent_before_wait_ts = now; + break; + + case TEVENT_TRACE_AFTER_WAIT: + diff = timeval_until(&tevent_before_wait_ts, &now); + if (diff.tv_sec > 3) { + DEBUG(DEBUG_ERR, + ("No event for %ld seconds!\n", + (long)diff.tv_sec)); + } + tevent_after_wait_ts = now; + break; + + default: + /* Do nothing for future tevent trace points */ ; + } +} + +static void ctdb_remove_pidfile(void) +{ + TALLOC_FREE(ctdbd_pidfile_ctx); +} + +static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx) +{ + if (ctdbd_pidfile != NULL) { + int ret = pidfile_context_create(mem_ctx, ctdbd_pidfile, + &ctdbd_pidfile_ctx); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed to create PID file %s\n", + ctdbd_pidfile)); + exit(11); + } + + DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile)); + atexit(ctdb_remove_pidfile); + } +} + +static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb) +{ + unsigned int i, j, count; + + /* initialize the vnn mapping table, skipping any deleted nodes */ + ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map); + CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map); + + count = 0; + for (i = 0; i < ctdb->num_nodes; i++) { + if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) { + count++; + } + } + + ctdb->vnn_map->generation = INVALID_GENERATION; + ctdb->vnn_map->size = count; + ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size); + CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map); + + for(i=0, j=0; i < ctdb->vnn_map->size; i++) { + if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) { + continue; + } + ctdb->vnn_map->map[j] = i; + j++; + } +} + +static void ctdb_set_my_pnn(struct ctdb_context *ctdb) +{ + if (ctdb->address == NULL) { + ctdb_fatal(ctdb, + "Can not determine PNN - node address is not set\n"); + } + + ctdb->pnn = 
ctdb_ip_to_pnn(ctdb, ctdb->address); + if (ctdb->pnn == CTDB_UNKNOWN_PNN) { + ctdb_fatal(ctdb, + "Can not determine PNN - unknown node address\n"); + } + + D_NOTICE("PNN is %u\n", ctdb->pnn); +} + +static void stdin_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, + void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type_abort( + private_data, struct ctdb_context); + ssize_t nread; + char c; + + nread = read(STDIN_FILENO, &c, 1); + if (nread != 1) { + D_ERR("stdin closed, exiting\n"); + talloc_free(fde); + ctdb_shutdown_sequence(ctdb, EPIPE); + } +} + +static int setup_stdin_handler(struct ctdb_context *ctdb) +{ + struct tevent_fd *fde; + struct stat st; + int ret; + + ret = fstat(STDIN_FILENO, &st); + if (ret != 0) { + /* Problem with stdin, ignore... */ + DBG_INFO("Can't fstat() stdin\n"); + return 0; + } + + if (!S_ISFIFO(st.st_mode)) { + DBG_INFO("Not a pipe...\n"); + return 0; + } + + fde = tevent_add_fd(ctdb->ev, + ctdb, + STDIN_FILENO, + TEVENT_FD_READ, + stdin_handler, + ctdb); + if (fde == NULL) { + return ENOMEM; + } + + DBG_INFO("Set up stdin handler\n"); + return 0; +} + +static void fork_only(void) +{ + pid_t pid; + + pid = fork(); + if (pid == -1) { + D_ERR("Fork failed (errno=%d)\n", errno); + exit(1); + } + + if (pid != 0) { + /* Parent simply exits... 
*/ + exit(0); + } +} + +static void sighup_hook(void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type_abort(private_data, + struct ctdb_context); + + if (ctdb->recoverd_pid > 0) { + kill(ctdb->recoverd_pid, SIGHUP); + } + ctdb_event_reopen_logs(ctdb); +} + +/* + start the protocol going as a daemon +*/ +int ctdb_start_daemon(struct ctdb_context *ctdb, + bool interactive, + bool test_mode_enabled) +{ + bool status; + int ret; + struct tevent_fd *fde; + + /* Fork if not interactive */ + if (!interactive) { + if (test_mode_enabled) { + /* Keep stdin open */ + fork_only(); + } else { + /* Fork, close stdin, start a session */ + become_daemon(true, false, false); + } + } + + ignore_signal(SIGPIPE); + ignore_signal(SIGUSR1); + + ctdb->ctdbd_pid = getpid(); + DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n", + SAMBA_VERSION_STRING, ctdb->ctdbd_pid)); + ctdb_create_pidfile(ctdb); + + /* create a unix domain stream socket to listen to */ + ret = ux_socket_bind(ctdb, test_mode_enabled); + if (ret != 0) { + D_ERR("Cannot continue. Exiting!\n"); + exit(10); + } + + /* Make sure we log something when the daemon terminates. + * This must be the first exit handler to run (so the last to + * be registered. 
+ */ + __ctdbd_pid = getpid(); + atexit(print_exit_message); + + if (ctdb->do_setsched) { + /* try to set us up as realtime */ + if (!set_scheduler()) { + exit(1); + } + DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n")); + } + + ctdb->ev = tevent_context_init(NULL); + if (ctdb->ev == NULL) { + DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n")); + exit(1); + } + tevent_loop_allow_nesting(ctdb->ev); + ctdb_tevent_trace_init(); + tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb); + + status = logging_setup_sighup_handler(ctdb->ev, + ctdb, + sighup_hook, + ctdb); + if (!status) { + D_ERR("Failed to set up signal handler for SIGHUP\n"); + exit(1); + } + + /* set up a handler to pick up sigchld */ + if (ctdb_init_sigchld(ctdb) == NULL) { + DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n")); + exit(1); + } + + if (!interactive) { + ctdb_set_child_logging(ctdb); + } + + /* Exit if stdin is closed */ + if (test_mode_enabled) { + ret = setup_stdin_handler(ctdb); + if (ret != 0) { + DBG_ERR("Failed to setup stdin handler\n"); + exit(1); + } + } + + TALLOC_FREE(ctdb->srv); + if (srvid_init(ctdb, &ctdb->srv) != 0) { + DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n")); + exit(1); + } + + TALLOC_FREE(ctdb->tunnels); + if (srvid_init(ctdb, &ctdb->tunnels) != 0) { + DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n")); + exit(1); + } + + /* initialize statistics collection */ + ctdb_statistics_init(ctdb); + + /* force initial recovery for election */ + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + + if (ctdb_start_eventd(ctdb) != 0) { + DEBUG(DEBUG_ERR, ("Failed to start event daemon\n")); + exit(1); + } + + ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT); + ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT); + if (ret != 0) { + ctdb_die(ctdb, "Failed to run init event\n"); + } + ctdb_run_notification_script(ctdb, "init"); + + if (strcmp(ctdb->transport, "tcp") == 0) { + ret = ctdb_tcp_init(ctdb); + } +#ifdef USE_INFINIBAND + if 
(strcmp(ctdb->transport, "ib") == 0) { + ret = ctdb_ibw_init(ctdb); + } +#endif + if (ret != 0) { + DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport)); + return -1; + } + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n")); + ctdb_fatal(ctdb, "transport is unavailable. can not initialize."); + } + + /* Initialise the transport. This sets the node address if it + * was not set via the command-line. */ + if (ctdb->methods->initialise(ctdb) != 0) { + ctdb_fatal(ctdb, "transport failed to initialise"); + } + + ctdb_set_my_pnn(ctdb); + + initialise_node_flags(ctdb); + + ret = ctdb_set_public_addresses(ctdb, true); + if (ret == -1) { + D_ERR("Unable to setup public IP addresses\n"); + exit(1); + } + + ctdb_initialise_vnn_map(ctdb); + + /* attach to existing databases */ + if (ctdb_attach_databases(ctdb) != 0) { + ctdb_fatal(ctdb, "Failed to attach to databases\n"); + } + + /* start frozen, then let the first election sort things out */ + if (!ctdb_blocking_freeze(ctdb)) { + ctdb_fatal(ctdb, "Failed to get initial freeze\n"); + } + + /* now start accepting clients, only can do this once frozen */ + fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ, + ctdb_accept_client, ctdb); + if (fde == NULL) { + ctdb_fatal(ctdb, "Failed to add daemon socket to event loop"); + } + tevent_fd_set_auto_close(fde); + + /* Start the transport */ + if (ctdb->methods->start(ctdb) != 0) { + DEBUG(DEBUG_ALERT,("transport failed to start!\n")); + ctdb_fatal(ctdb, "transport failed to start"); + } + + /* Recovery daemon and timed events are started from the + * callback, only after the setup event completes + * successfully. 
+ */ + ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP); + ret = ctdb_event_script_callback(ctdb, + ctdb, + ctdb_setup_event_callback, + ctdb, + CTDB_EVENT_SETUP, + "%s", + ""); + if (ret != 0) { + DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n")); + exit(1); + } + + lockdown_memory(ctdb->valgrinding); + + /* go into a wait loop to allow other nodes to complete */ + tevent_loop_wait(ctdb->ev); + + DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n")); + exit(1); +} + +/* + allocate a packet for use in daemon<->daemon communication + */ +struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + enum ctdb_operation operation, + size_t length, size_t slength, + const char *type) +{ + int size; + struct ctdb_req_header *hdr; + + length = MAX(length, slength); + size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1); + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. 
Transport is DOWN.\n", + operation, (unsigned)length)); + return NULL; + } + + hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size); + if (hdr == NULL) { + DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n", + operation, (unsigned)length)); + return NULL; + } + talloc_set_name_const(hdr, type); + memset(hdr, 0, slength); + hdr->length = length; + hdr->operation = operation; + hdr->ctdb_magic = CTDB_MAGIC; + hdr->ctdb_version = CTDB_PROTOCOL; + hdr->generation = ctdb->vnn_map->generation; + hdr->srcnode = ctdb->pnn; + + return hdr; +} + +struct daemon_control_state { + struct daemon_control_state *next, *prev; + struct ctdb_client *client; + struct ctdb_req_control_old *c; + uint32_t reqid; + struct ctdb_node *node; +}; + +/* + callback when a control reply comes in + */ +static void daemon_control_callback(struct ctdb_context *ctdb, + int32_t status, TDB_DATA data, + const char *errormsg, + void *private_data) +{ + struct daemon_control_state *state = talloc_get_type(private_data, + struct daemon_control_state); + struct ctdb_client *client = state->client; + struct ctdb_reply_control_old *r; + size_t len; + int ret; + + /* construct a message to send to the client containing the data */ + len = offsetof(struct ctdb_reply_control_old, data) + data.dsize; + if (errormsg) { + len += strlen(errormsg); + } + r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len, + struct ctdb_reply_control_old); + CTDB_NO_MEMORY_VOID(ctdb, r); + + r->hdr.reqid = state->reqid; + r->status = status; + r->datalen = data.dsize; + r->errorlen = 0; + memcpy(&r->data[0], data.dptr, data.dsize); + if (errormsg) { + r->errorlen = strlen(errormsg); + memcpy(&r->data[r->datalen], errormsg, r->errorlen); + } + + ret = daemon_queue_send(client, &r->hdr); + if (ret != -1) { + talloc_free(state); + } +} + +/* + fail all pending controls to a disconnected node + */ +void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct 
ctdb_node *node) +{ + struct daemon_control_state *state; + while ((state = node->pending_controls)) { + DLIST_REMOVE(node->pending_controls, state); + daemon_control_callback(ctdb, (uint32_t)-1, tdb_null, + "node is disconnected", state); + } +} + +/* + destroy a daemon_control_state + */ +static int daemon_control_destructor(struct daemon_control_state *state) +{ + if (state->node) { + DLIST_REMOVE(state->node->pending_controls, state); + } + return 0; +} + +/* + this is called when the ctdb daemon received a ctdb request control + from a local client over the unix domain socket + */ +static void daemon_request_control_from_client(struct ctdb_client *client, + struct ctdb_req_control_old *c) +{ + TDB_DATA data; + int res; + struct daemon_control_state *state; + TALLOC_CTX *tmp_ctx = talloc_new(client); + + if (c->hdr.destnode == CTDB_CURRENT_NODE) { + c->hdr.destnode = client->ctdb->pnn; + } + + state = talloc(client, struct daemon_control_state); + CTDB_NO_MEMORY_VOID(client->ctdb, state); + + state->client = client; + state->c = talloc_steal(state, c); + state->reqid = c->hdr.reqid; + if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) { + state->node = client->ctdb->nodes[c->hdr.destnode]; + DLIST_ADD(state->node->pending_controls, state); + } else { + state->node = NULL; + } + + talloc_set_destructor(state, daemon_control_destructor); + + if (c->flags & CTDB_CTRL_FLAG_NOREPLY) { + talloc_steal(tmp_ctx, state); + } + + data.dptr = &c->data[0]; + data.dsize = c->datalen; + res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode, + c->srvid, c->opcode, client->client_id, + c->flags, + data, daemon_control_callback, + state); + if (res != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n", + c->hdr.destnode)); + } + + talloc_free(tmp_ctx); +} + +static void daemon_request_tunnel_from_client(struct ctdb_client *client, + struct ctdb_req_tunnel_old *c) +{ + TDB_DATA data; + int ret; + + if (! 
ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) { + DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n", + c->hdr.destnode)); + return; + } + + ret = srvid_exists(client->ctdb->tunnels, c->tunnel_id, NULL); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n", + c->tunnel_id)); + return; + } + + data = (TDB_DATA) { + .dsize = c->datalen, + .dptr = &c->data[0], + }; + + ret = ctdb_daemon_send_tunnel(client->ctdb, c->hdr.destnode, + c->tunnel_id, c->flags, data); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to set tunnel to remote note %u\n", + c->hdr.destnode)); + } +} + +/* + register a call function +*/ +int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id, + ctdb_fn_t fn, int id) +{ + struct ctdb_registered_call *call; + struct ctdb_db_context *ctdb_db; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) { + return -1; + } + + call = talloc(ctdb_db, struct ctdb_registered_call); + call->fn = fn; + call->id = id; + + DLIST_ADD(ctdb_db->calls, call); + return 0; +} + + + +/* + this local messaging handler is ugly, but is needed to prevent + recursion in ctdb_send_message() when the destination node is the + same as the source node + */ +struct ctdb_local_message { + struct ctdb_context *ctdb; + uint64_t srvid; + TDB_DATA data; +}; + +static void ctdb_local_message_trigger(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_local_message *m = talloc_get_type( + private_data, struct ctdb_local_message); + + srvid_dispatch(m->ctdb->srv, m->srvid, CTDB_SRVID_ALL, m->data); + talloc_free(m); +} + +static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data) +{ + struct ctdb_local_message *m; + m = talloc(ctdb, struct ctdb_local_message); + CTDB_NO_MEMORY(ctdb, m); + + m->ctdb = ctdb; + m->srvid = srvid; + m->data = data; + m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize); + if (m->data.dptr == NULL) { + 
talloc_free(m); + return -1; + } + + /* this needs to be done as an event to prevent recursion */ + tevent_add_timer(ctdb->ev, m, timeval_zero(), + ctdb_local_message_trigger, m); + return 0; +} + +/* + send a ctdb message +*/ +int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn, + uint64_t srvid, TDB_DATA data) +{ + struct ctdb_req_message_old *r; + int len; + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n")); + return -1; + } + + /* see if this is a message to ourselves */ + if (pnn == ctdb->pnn) { + return ctdb_local_message(ctdb, srvid, data); + } + + len = offsetof(struct ctdb_req_message_old, data) + data.dsize; + r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len, + struct ctdb_req_message_old); + CTDB_NO_MEMORY(ctdb, r); + + r->hdr.destnode = pnn; + r->srvid = srvid; + r->datalen = data.dsize; + memcpy(&r->data[0], data.dptr, data.dsize); + + ctdb_queue_packet(ctdb, &r->hdr); + + talloc_free(r); + return 0; +} + + + +struct ctdb_client_notify_list { + struct ctdb_client_notify_list *next, *prev; + struct ctdb_context *ctdb; + uint64_t srvid; + TDB_DATA data; +}; + + +static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl) +{ + int ret; + + DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid)); + + ret = ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED, (unsigned long long)nl->srvid, nl->data); + if (ret != 0) { + DEBUG(DEBUG_ERR,("Failed to send client notify message\n")); + } + + return 0; +} + +int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata) +{ + struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr; + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + struct ctdb_client_notify_list *nl; + + DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long 
long)notify->srvid, client_id)); + + if (indata.dsize < offsetof(struct ctdb_notify_data_old, notify_data)) { + DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize)); + return -1; + } + + if (indata.dsize != (notify->len + offsetof(struct ctdb_notify_data_old, notify_data))) { + DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_notify_data_old, notify_data)))); + return -1; + } + + + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n")); + return -1; + } + + for(nl=client->notify; nl; nl=nl->next) { + if (nl->srvid == notify->srvid) { + break; + } + } + if (nl != NULL) { + DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid)); + return -1; + } + + nl = talloc(client, struct ctdb_client_notify_list); + CTDB_NO_MEMORY(ctdb, nl); + nl->ctdb = ctdb; + nl->srvid = notify->srvid; + nl->data.dsize = notify->len; + nl->data.dptr = talloc_memdup(nl, notify->notify_data, + nl->data.dsize); + CTDB_NO_MEMORY(ctdb, nl->data.dptr); + + DLIST_ADD(client->notify, nl); + talloc_set_destructor(nl, ctdb_client_notify_destructor); + + return 0; +} + +int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata) +{ + uint64_t srvid = *(uint64_t *)indata.dptr; + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + struct ctdb_client_notify_list *nl; + + DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id)); + + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. 
You can not send this control to a remote node\n")); + return -1; + } + + for(nl=client->notify; nl; nl=nl->next) { + if (nl->srvid == srvid) { + break; + } + } + if (nl == NULL) { + DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid)); + return -1; + } + + DLIST_REMOVE(client->notify, nl); + talloc_set_destructor(nl, NULL); + talloc_free(nl); + + return 0; +} + +struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid) +{ + struct ctdb_client_pid_list *client_pid; + + for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) { + if (client_pid->pid == pid) { + return client_pid->client; + } + } + return NULL; +} + + +/* This control is used by samba when probing if a process (of a samba daemon) + exists on the node. + Samba does this when it needs/wants to check if a subrecord in one of the + databases is still valid, or if it is stale and can be removed. + If the node is in unhealthy or stopped state we just kill of the samba + process holding this sub-record and return to the calling samba that + the process does not exist. + This allows us to forcefully recall subrecords registered by samba processes + on banned and stopped nodes. 
+*/ +int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid) +{ + struct ctdb_client *client; + + client = ctdb_find_client_by_pid(ctdb, pid); + if (client == NULL) { + return -1; + } + + if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) { + DEBUG(DEBUG_NOTICE, + ("Killing client with pid:%d on banned/stopped node\n", + (int)pid)); + talloc_free(client); + return -1; + } + + return kill(pid, 0); +} + +int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb, + TDB_DATA indata) +{ + struct ctdb_client_pid_list *client_pid; + pid_t pid; + uint64_t srvid; + int ret; + + pid = *(pid_t *)indata.dptr; + srvid = *(uint64_t *)(indata.dptr + sizeof(pid_t)); + + for (client_pid = ctdb->client_pids; + client_pid != NULL; + client_pid = client_pid->next) { + if (client_pid->pid == pid) { + ret = srvid_exists(ctdb->srv, srvid, + client_pid->client); + if (ret == 0) { + return 0; + } + } + } + + return -1; +} + +int ctdb_control_getnodesfile(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata) +{ + struct ctdb_node_map_old *node_map = NULL; + + CHECK_CONTROL_DATA_SIZE(0); + + node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file); + if (node_map == NULL) { + DEBUG(DEBUG_ERR, ("Failed to read nodes file\n")); + return -1; + } + + outdata->dptr = (unsigned char *)node_map; + outdata->dsize = talloc_get_size(outdata->dptr); + + return 0; +} + +void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code) +{ + if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) { + DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n")); + return; + } + + DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n")); + ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN); + ctdb_stop_recoverd(ctdb); + ctdb_stop_keepalive(ctdb); + ctdb_stop_monitoring(ctdb); + ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN); + ctdb_stop_eventd(ctdb); + if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) { + ctdb->methods->shutdown(ctdb); + } + + 
DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n")); + exit(exit_code); +} + +/* When forking the main daemon and the child process needs to connect + * back to the daemon as a client process, this function can be used + * to change the ctdb context from daemon into client mode. The child + * process must be created using ctdb_fork() and not fork() - + * ctdb_fork() does some necessary housekeeping. + */ +int switch_from_server_to_client(struct ctdb_context *ctdb) +{ + int ret; + + if (ctdb->daemon.sd != -1) { + close(ctdb->daemon.sd); + ctdb->daemon.sd = -1; + } + + /* get a new event context */ + ctdb->ev = tevent_context_init(ctdb); + if (ctdb->ev == NULL) { + DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n")); + exit(1); + } + tevent_loop_allow_nesting(ctdb->ev); + + /* Connect to main CTDB daemon */ + ret = ctdb_socket_connect(ctdb); + if (ret != 0) { + DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n")); + return -1; + } + + ctdb->can_send_controls = true; + + return 0; +} diff --git a/ctdb/server/ctdb_fork.c b/ctdb/server/ctdb_fork.c new file mode 100644 index 0000000..1065423 --- /dev/null +++ b/ctdb/server/ctdb_fork.c @@ -0,0 +1,216 @@ +/* + functions to track and manage processes + + Copyright (C) Ronnie Sahlberg 2012 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/wait.h" +#include "system/network.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/debug.h" +#include "lib/util/time.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/rb_tree.h" +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" + +void ctdb_track_child(struct ctdb_context *ctdb, pid_t pid) +{ + char *process; + + /* Only CTDB main daemon should track child processes */ + if (getpid() != ctdb->ctdbd_pid) { + return; + } + + process = talloc_asprintf(ctdb->child_processes, "process:%d", (int)pid); + trbt_insert32(ctdb->child_processes, pid, process); +} + +/* + * This function forks a child process and drops the realtime + * scheduler for the child process. + */ +pid_t ctdb_fork(struct ctdb_context *ctdb) +{ + pid_t pid; + struct timeval before; + double delta_t; + + before = timeval_current(); + + pid = fork(); + if (pid == -1) { + DEBUG(DEBUG_ERR, + (__location__ " fork() failed (%s)\n", strerror(errno))); + return -1; + } + if (pid == 0) { + /* Close the Unix Domain socket and the TCP socket. + * This ensures that none of the child processes will + * look like the main daemon when it is not running. + * tevent needs to be stopped before closing sockets. 
+ */ + if (ctdb->ev != NULL) { + talloc_free(ctdb->ev); + ctdb->ev = NULL; + } + if (ctdb->daemon.sd != -1) { + close(ctdb->daemon.sd); + ctdb->daemon.sd = -1; + } + if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) { + ctdb->methods->shutdown(ctdb); + } + + /* The child does not need to be realtime */ + if (ctdb->do_setsched) { + reset_scheduler(); + } + ctdb->can_send_controls = false; + + return 0; + } + + delta_t = timeval_elapsed(&before); + if (delta_t > 3.0) { + DEBUG(DEBUG_WARNING, ("fork() took %lf seconds\n", delta_t)); + } + + ctdb_track_child(ctdb, pid); + return pid; +} + +/* + * vfork + exec + */ +pid_t ctdb_vfork_exec(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, + const char *helper, int helper_argc, + const char **helper_argv) +{ + pid_t pid; + struct timeval before; + double delta_t; + char **argv; + int i; + + argv = talloc_array(mem_ctx, char *, helper_argc + 1); + if (argv == NULL) { + DEBUG(DEBUG_ERR, ("Memory allocation error\n")); + return -1; + } + + argv[0] = discard_const(helper); + for (i=0; i<helper_argc; i++) { + argv[i+1] = discard_const(helper_argv[i]); + } + + before = timeval_current(); + + pid = vfork(); + if (pid == -1) { + DEBUG(DEBUG_ERR, ("vfork() failed (%s)\n", strerror(errno))); + return -1; + } + + if (pid == 0) { + execv(helper, argv); + _exit(1); + } + + delta_t = timeval_elapsed(&before); + if (delta_t > 3.0) { + DEBUG(DEBUG_WARNING, ("vfork() took %lf seconds\n", delta_t)); + } + + ctdb_track_child(ctdb, pid); + return pid; +} + +static void ctdb_sigchld_handler(struct tevent_context *ev, + struct tevent_signal *te, int signum, int count, + void *dont_care, + void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + int status; + pid_t pid = -1; + + while (pid != 0) { + pid = waitpid(-1, &status, WNOHANG); + if (pid == -1) { + DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. 
errno:%d\n", errno)); + return; + } + if (pid > 0) { + char *process; + + if (getpid() != ctdb->ctdbd_pid) { + continue; + } + + process = trbt_lookup32(ctdb->child_processes, pid); + if (process == NULL) { + DEBUG(DEBUG_ERR,("Got SIGCHLD from pid:%d we didn not spawn with ctdb_fork\n", pid)); + } + + DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d %s\n", (int)pid, process)); + talloc_free(process); + } + } +} + + +struct tevent_signal * +ctdb_init_sigchld(struct ctdb_context *ctdb) +{ + struct tevent_signal *se; + + ctdb->child_processes = trbt_create(ctdb, 0); + + se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0, ctdb_sigchld_handler, ctdb); + return se; +} + +int +ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum) +{ + char *process; + + if (signum == 0) { + return kill(pid, signum); + } + + if (getpid() != ctdb->ctdbd_pid) { + return kill(pid, signum); + } + + process = trbt_lookup32(ctdb->child_processes, pid); + if (process == NULL) { + DEBUG(DEBUG_ERR,("ctdb_kill: trying to kill(%d, %d) a process that does not exist\n", pid, signum)); + return 0; + } + + return kill(pid, signum); +} diff --git a/ctdb/server/ctdb_freeze.c b/ctdb/server/ctdb_freeze.c new file mode 100644 index 0000000..06aeacf --- /dev/null +++ b/ctdb/server/ctdb_freeze.c @@ -0,0 +1,923 @@ +/* + ctdb freeze handling + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" + +#include "ctdb_private.h" + +#include "common/rb_tree.h" +#include "common/common.h" +#include "common/logging.h" + +/** + * Cancel a transaction on database + */ +static int db_transaction_cancel_handler(struct ctdb_db_context *ctdb_db, + void *private_data) +{ + int ret; + + tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK); + ret = tdb_transaction_cancel(ctdb_db->ltdb->tdb); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to cancel transaction for db %s\n", + ctdb_db->db_name)); + } + tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK); + return 0; +} + +/** + * Start a transaction on database + */ +static int db_transaction_start_handler(struct ctdb_db_context *ctdb_db, + void *private_data) +{ + bool freeze_transaction_started = *(bool *)private_data; + int ret; + + tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK); + if (freeze_transaction_started) { + ret = tdb_transaction_cancel(ctdb_db->ltdb->tdb); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed to cancel transaction for db %s\n", + ctdb_db->db_name)); + } + } + ret = tdb_transaction_start(ctdb_db->ltdb->tdb); + tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to start transaction for db %s\n", + ctdb_db->db_name)); + return -1; + } + return 0; +} + +/** + * Commit a transaction on database + */ +static int db_transaction_commit_handler(struct ctdb_db_context *ctdb_db, + void *private_data) +{ + unsigned int healthy_nodes = *(unsigned int *)private_data; + int ret; + + tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK); + ret = tdb_transaction_commit(ctdb_db->ltdb->tdb); + tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to commit transaction for db %s\n", + ctdb_db->db_name)); + 
return -1; + } + + ret = ctdb_update_persistent_health(ctdb_db->ctdb, ctdb_db, NULL, + healthy_nodes); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to update persistent health for db %s\n", + ctdb_db->db_name)); + } + return ret; +} + +/* a list of control requests waiting for db freeze */ +struct ctdb_db_freeze_waiter { + struct ctdb_db_freeze_waiter *next, *prev; + struct ctdb_context *ctdb; + void *private_data; + int32_t status; +}; + +/* a handle to a db freeze lock child process */ +struct ctdb_db_freeze_handle { + struct ctdb_db_context *ctdb_db; + struct lock_request *lreq; + struct ctdb_db_freeze_waiter *waiters; +}; + +/** + * Called when freeing database freeze handle + */ +static int ctdb_db_freeze_handle_destructor(struct ctdb_db_freeze_handle *h) +{ + struct ctdb_db_context *ctdb_db = h->ctdb_db; + + DEBUG(DEBUG_ERR, ("Release freeze handle for db %s\n", + ctdb_db->db_name)); + + /* Cancel any pending transactions */ + if (ctdb_db->freeze_transaction_started) { + db_transaction_cancel_handler(ctdb_db, NULL); + ctdb_db->freeze_transaction_started = false; + } + ctdb_db->freeze_mode = CTDB_FREEZE_NONE; + ctdb_db->freeze_handle = NULL; + + /* Clear invalid records flag */ + ctdb_db->invalid_records = false; + + talloc_free(h->lreq); + return 0; +} + +/** + * Called when a database is frozen + */ +static void ctdb_db_freeze_handler(void *private_data, bool locked) +{ + struct ctdb_db_freeze_handle *h = talloc_get_type_abort( + private_data, struct ctdb_db_freeze_handle); + struct ctdb_db_freeze_waiter *w; + + if (h->ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) { + DEBUG(DEBUG_ERR, ("Freeze db child died - unfreezing\n")); + h->ctdb_db->freeze_mode = CTDB_FREEZE_NONE; + talloc_free(h); + return; + } + + if (!locked) { + DEBUG(DEBUG_ERR, ("Failed to get db lock for %s\n", + h->ctdb_db->db_name)); + h->ctdb_db->freeze_mode = CTDB_FREEZE_NONE; + talloc_free(h); + return; + } + + h->ctdb_db->freeze_mode = CTDB_FREEZE_FROZEN; + + /* notify the waiters */ + 
while ((w = h->waiters) != NULL) { + w->status = 0; + DLIST_REMOVE(h->waiters, w); + talloc_free(w); + } +} + +/** + * Start freeze process for a database + */ +static void ctdb_start_db_freeze(struct ctdb_db_context *ctdb_db) +{ + struct ctdb_db_freeze_handle *h; + + if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) { + return; + } + + if (ctdb_db->freeze_handle != NULL) { + return; + } + + DEBUG(DEBUG_ERR, ("Freeze db: %s\n", ctdb_db->db_name)); + + ctdb_stop_vacuuming(ctdb_db->ctdb); + + h = talloc_zero(ctdb_db, struct ctdb_db_freeze_handle); + CTDB_NO_MEMORY_FATAL(ctdb_db->ctdb, h); + + h->ctdb_db = ctdb_db; + h->lreq = ctdb_lock_db(h, ctdb_db, false, ctdb_db_freeze_handler, h); + CTDB_NO_MEMORY_FATAL(ctdb_db->ctdb, h->lreq); + talloc_set_destructor(h, ctdb_db_freeze_handle_destructor); + + ctdb_db->freeze_handle = h; + ctdb_db->freeze_mode = CTDB_FREEZE_PENDING; +} + +/** + * Reply to a waiter for db freeze + */ +static int ctdb_db_freeze_waiter_destructor(struct ctdb_db_freeze_waiter *w) +{ + /* 'c' pointer is talloc_memdup(), so cannot use talloc_get_type */ + struct ctdb_req_control_old *c = + (struct ctdb_req_control_old *)w->private_data; + + ctdb_request_control_reply(w->ctdb, c, NULL, w->status, NULL); + return 0; +} + +/** + * freeze a database + */ +int32_t ctdb_control_db_freeze(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + uint32_t db_id, + bool *async_reply) +{ + struct ctdb_db_context *ctdb_db; + struct ctdb_db_freeze_waiter *w; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR, ("Freeze db for unknown dbid 0x%08x\n", db_id)); + return -1; + } + + if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) { + DEBUG(DEBUG_ERR, ("Freeze db: %s frozen\n", ctdb_db->db_name)); + return 0; + } + + ctdb_start_db_freeze(ctdb_db); + + /* add ourselves to the list of waiters */ + w = talloc(ctdb_db->freeze_handle, struct ctdb_db_freeze_waiter); + CTDB_NO_MEMORY(ctdb, w); + w->ctdb = ctdb; + w->private_data = 
talloc_steal(w, c); + w->status = -1; + talloc_set_destructor(w, ctdb_db_freeze_waiter_destructor); + DLIST_ADD(ctdb_db->freeze_handle->waiters, w); + + *async_reply = true; + return 0; +} + +/** + * Thaw a database + */ +int32_t ctdb_control_db_thaw(struct ctdb_context *ctdb, uint32_t db_id) +{ + struct ctdb_db_context *ctdb_db; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR, ("Thaw db for unknown dbid 0x%08x\n", db_id)); + return -1; + } + + DEBUG(DEBUG_ERR, ("Thaw db: %s generation %u\n", ctdb_db->db_name, + ctdb_db->generation)); + + TALLOC_FREE(ctdb_db->freeze_handle); + ctdb_call_resend_db(ctdb_db); + return 0; +} + + +/* + a list of control requests waiting for a freeze lock child to get + the database locks + */ +struct ctdb_freeze_waiter { + struct ctdb_freeze_waiter *next, *prev; + struct ctdb_context *ctdb; + struct ctdb_req_control_old *c; + int32_t status; +}; + +/* a handle to a freeze lock child process */ +struct ctdb_freeze_handle { + struct ctdb_context *ctdb; + unsigned int num_total, num_locked, num_failed; + struct ctdb_freeze_waiter *waiters; +}; + +static int db_thaw(struct ctdb_db_context *ctdb_db, void *private_data) +{ + talloc_free(ctdb_db->freeze_handle); + return 0; +} + +/* + destroy a freeze handle + */ +static int ctdb_freeze_handle_destructor(struct ctdb_freeze_handle *h) +{ + struct ctdb_context *ctdb = h->ctdb; + + DEBUG(DEBUG_ERR,("Release freeze handle\n")); + + /* cancel any pending transactions */ + if (ctdb->freeze_transaction_started) { + ctdb_db_iterator(ctdb, db_transaction_cancel_handler, NULL); + ctdb->freeze_transaction_started = false; + } + + ctdb_db_iterator(ctdb, db_thaw, NULL); + + ctdb->freeze_mode = CTDB_FREEZE_NONE; + ctdb->freeze_handle = NULL; + + return 0; +} + +/* + called when the child writes its status to us + */ +static void ctdb_freeze_lock_handler(void *private_data, bool locked) +{ + struct ctdb_freeze_handle *h = talloc_get_type_abort(private_data, + struct 
ctdb_freeze_handle); + struct ctdb_freeze_waiter *w; + + if (h->ctdb->freeze_mode == CTDB_FREEZE_FROZEN) { + DEBUG(DEBUG_INFO,("freeze child died - unfreezing\n")); + talloc_free(h); + return; + } + + if (!locked) { + DEBUG(DEBUG_ERR,("Failed to get locks in ctdb_freeze_child\n")); + /* we didn't get the locks - destroy the handle */ + talloc_free(h); + return; + } + + h->ctdb->freeze_mode = CTDB_FREEZE_FROZEN; + + /* notify the waiters */ + if (h != h->ctdb->freeze_handle) { + DEBUG(DEBUG_ERR,("lockwait finished but h is not linked\n")); + } + while ((w = h->waiters)) { + w->status = 0; + DLIST_REMOVE(h->waiters, w); + talloc_free(w); + } +} + +/** + * When single database is frozen + */ +static int db_freeze_waiter_destructor(struct ctdb_db_freeze_waiter *w) +{ + struct ctdb_freeze_handle *h = talloc_get_type_abort( + w->private_data, struct ctdb_freeze_handle); + + if (w->status == 0) { + h->num_locked += 1; + } else { + h->num_failed += 1; + } + + /* Call ctdb_freeze_lock_handler() only when the status of all + * databases is known. + */ + if (h->num_locked + h->num_failed == h->num_total) { + bool locked; + + if (h->num_locked == h->num_total) { + locked = true; + } else { + locked = false; + } + ctdb_freeze_lock_handler(h, locked); + } + return 0; +} + +/** + * Invalidate the records in the database. + * This only applies to volatile databases. 
+ */
+static int db_invalidate(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	if (ctdb_db_volatile(ctdb_db)) {
+		ctdb_db->invalid_records = true;
+	}
+
+	return 0; /* never fails, so the database iteration is never cut short here */
+}
+
+/**
+ * Count the number of databases
+ *
+ * Database iterator callback. private_data points to an unsigned int
+ * counter which is incremented once per call. Always returns 0.
+ */
+static int db_count(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	unsigned int *count = (unsigned int *)private_data;
+
+	*count += 1;
+
+	return 0;
+}
+
+/**
+ * Freeze a single database
+ *
+ * Database iterator callback used by ctdb_start_freeze(). private_data is
+ * the freeze-all handle (struct ctdb_freeze_handle).
+ *
+ * Starts the per-database freeze and creates a waiter owned by the
+ * database's freeze handle. The waiter's destructor,
+ * db_freeze_waiter_destructor(), is what accounts for this database in the
+ * freeze-all handle, so a waiter is created even when the database is
+ * already frozen; in that case it is freed straight away with status 0.
+ * Otherwise it is queued on the database's freeze handle with status -1
+ * until the per-database freeze completes.
+ *
+ * Returns 0 on success; allocation failure is handled by CTDB_NO_MEMORY().
+ */
+static int db_freeze(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	struct ctdb_freeze_handle *h = talloc_get_type_abort(
+		private_data, struct ctdb_freeze_handle);
+	struct ctdb_db_freeze_waiter *w;
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	w = talloc(ctdb_db->freeze_handle, struct ctdb_db_freeze_waiter);
+	CTDB_NO_MEMORY(h->ctdb, w);
+	w->ctdb = h->ctdb;
+	w->private_data = h;
+	w->status = -1; /* counted as a failure unless set to 0 before the waiter is freed */
+	talloc_set_destructor(w, db_freeze_waiter_destructor);
+
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_FROZEN) {
+		/* Early return if already frozen */
+		w->status = 0;
+		talloc_free(w); /* runs the destructor, which records this database as locked */
+		return 0;
+	}
+
+	DLIST_ADD(ctdb_db->freeze_handle->waiters, w);
+
+	return 0;
+}
+
+/*
+  start the freeze process for all databases
+  This is only called from ctdb_control_freeze(), which is called
+  only on node becoming INACTIVE. So mark the records invalid.
+
+  Three situations are handled:
+   - already frozen: if the number of attached databases differs from the
+     count stored in the freeze handle, the counters are reset and every
+     database is frozen again so that newly attached ones are included;
+   - a freeze handle already exists: a freeze is in progress, nothing to do;
+   - otherwise: a new freeze handle is created, the databases are counted
+     and a freeze is started on each of them. With no databases at all the
+     node is marked frozen immediately.
+ */
+static void ctdb_start_freeze(struct ctdb_context *ctdb)
+{
+	struct ctdb_freeze_handle *h;
+	int ret;
+
+	ctdb_db_iterator(ctdb, db_invalidate, NULL);
+
+	if (ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+		unsigned int count = 0;
+
+		/*
+		 * Check if all the databases are frozen
+		 *
+		 * It's possible that the databases can get attached after
+		 * initial freeze. This typically happens during startup as
+		 * CTDB will only attach persistent databases and go in to
+		 * startup freeze. The recovery master during recovery will
+		 * attach all the missing databases.
+		 */
+
+		h = ctdb->freeze_handle;
+		if (h == NULL) {
+			ctdb->freeze_mode = CTDB_FREEZE_NONE;
+			return;
+		}
+
+		ret = ctdb_db_iterator(ctdb, db_count, &count);
+		if (ret != 0) {
+			TALLOC_FREE(ctdb->freeze_handle);
+			ctdb->freeze_mode = CTDB_FREEZE_NONE;
+			return;
+		}
+
+		if (count != h->num_total) {
+			DEBUG(DEBUG_ERR, ("Freeze all: incremental\n"));
+
+			h->num_total = count;
+			h->num_locked = 0;
+			h->num_failed = 0;
+
+			ctdb->freeze_mode = CTDB_FREEZE_PENDING;
+
+			ret = ctdb_db_iterator(ctdb, db_freeze, h);
+			if (ret != 0) {
+				TALLOC_FREE(ctdb->freeze_handle);
+				ctdb->freeze_mode = CTDB_FREEZE_NONE;
+			}
+		}
+		return;
+	}
+
+	if (ctdb->freeze_handle != NULL) {
+		/* already trying to freeze */
+		return;
+	}
+
+	DEBUG(DEBUG_ERR, ("Freeze all\n"));
+
+	/* Stop any vacuuming going on: we don't want to wait. */
+	ctdb_stop_vacuuming(ctdb);
+
+	/* create freeze lock children for each database */
+	h = talloc_zero(ctdb, struct ctdb_freeze_handle);
+	CTDB_NO_MEMORY_FATAL(ctdb, h);
+	h->ctdb = ctdb;
+	talloc_set_destructor(h, ctdb_freeze_handle_destructor);
+	ctdb->freeze_handle = h;
+
+	ret = ctdb_db_iterator(ctdb, db_count, &h->num_total);
+	if (ret != 0) {
+		talloc_free(h); /* the handle destructor clears ctdb->freeze_handle and freeze_mode */
+		return;
+	}
+
+	ctdb->freeze_mode = CTDB_FREEZE_PENDING;
+
+	ret = ctdb_db_iterator(ctdb, db_freeze, h);
+	if (ret != 0) {
+		talloc_free(h);
+		return;
+	}
+
+	if (h->num_total == 0) {
+		ctdb->freeze_mode = CTDB_FREEZE_FROZEN; /* no waiter will ever fire, so finish here */
+	}
+}
+
+/*
+  destroy a waiter for a freeze mode change
+
+  Sends the reply for the control request stored in the waiter, using the
+  waiter's status as the result. Freeing a waiter is therefore what
+  answers the client that asked for the freeze.
+ */
+static int ctdb_freeze_waiter_destructor(struct ctdb_freeze_waiter *w)
+{
+	ctdb_request_control_reply(w->ctdb, w->c, NULL, w->status, NULL);
+	return 0;
+}
+
+/*
+  freeze all the databases
+  This control is only used when freezing database on node becoming INACTIVE.
+  So mark the records invalid in ctdb_start_freeze().
+ */
+int32_t ctdb_control_freeze(struct ctdb_context *ctdb,
+			    struct ctdb_req_control_old *c, bool *async_reply)
+{
+	struct ctdb_freeze_waiter *w;
+
+	/*
+	 * Returns 0 straight away when the node is already frozen or has no
+	 * databases, and -1 when no freeze handle exists. Otherwise the
+	 * request is parked as a waiter on the freeze handle and
+	 * *async_reply is set; the reply is sent when the waiter is freed.
+	 */
+	ctdb_start_freeze(ctdb);
+
+	if (ctdb->freeze_mode == CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR, ("Freeze all: frozen\n"));
+		/* we're already frozen */
+		return 0;
+	}
+
+	if (ctdb->freeze_handle == NULL) {
+		DEBUG(DEBUG_ERR,("No freeze lock handle when adding a waiter\n"));
+		return -1;
+	}
+
+	/* If there are no databases, we are done. */
+	if (ctdb->freeze_handle->num_total == 0) {
+		return 0;
+	}
+
+	/* add ourselves to list of waiters */
+	w = talloc(ctdb->freeze_handle, struct ctdb_freeze_waiter);
+	CTDB_NO_MEMORY(ctdb, w);
+	w->ctdb = ctdb;
+	/* the waiter takes ownership of the request so it outlives this call */
+	w->c = talloc_steal(w, c);
+	w->status = -1;
+	talloc_set_destructor(w, ctdb_freeze_waiter_destructor);
+	DLIST_ADD(ctdb->freeze_handle->waiters, w);
+
+	/* we won't reply till later */
+	*async_reply = true;
+	return 0;
+}
+
+
+/*
+  Database iterator callback: start freezing one database and run the
+  event loop until the freeze is no longer pending.
+  private_data is the tevent context to run.
+  Returns 0 if the database ended up frozen, -1 otherwise.
+ */
+static int db_freeze_block(struct ctdb_db_context *ctdb_db, void *private_data)
+{
+	struct tevent_context *ev = (struct tevent_context *)private_data;
+
+	ctdb_start_db_freeze(ctdb_db);
+
+	while (ctdb_db->freeze_mode == CTDB_FREEZE_PENDING) {
+		tevent_loop_once(ev);
+	}
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  block until we are frozen, used during daemon startup
+
+  Freezes the databases one at a time; returns false as soon as one of
+  them fails to freeze, true when all of them are frozen.
+ */
+bool ctdb_blocking_freeze(struct ctdb_context *ctdb)
+{
+	int ret;
+
+	ret = ctdb_db_iterator(ctdb, db_freeze_block, ctdb->ev);
+	if (ret != 0) {
+		return false;
+	}
+
+	return true;
+}
+
+/*
+  thaw the databases
+
+  With check_recmode set, the thaw is refused (-1) while recovery is
+  active. Otherwise any transaction started under the freeze is
+  cancelled, the per-database and global freeze handles are released and
+  queued calls are resent. Returns 0 on success.
+ */
+int32_t ctdb_control_thaw(struct ctdb_context *ctdb, bool check_recmode)
+{
+	if (check_recmode && ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) {
+		DEBUG(DEBUG_ERR, ("Failing to thaw databases while "
+				  "recovery is active\n"));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR,("Thawing all\n"));
+
+	/* cancel any pending transactions */
+	if (ctdb->freeze_transaction_started) {
+		ctdb_db_iterator(ctdb, db_transaction_cancel_handler, NULL);
+		ctdb->freeze_transaction_started = false;
+	}
+
+	ctdb_db_iterator(ctdb, db_thaw, NULL);
+	TALLOC_FREE(ctdb->freeze_handle);
+
+	ctdb_call_resend_all(ctdb);
+	return 0;
+}
+
+/**
+ * Database transaction wrappers
+ *
+ * These functions are wrappers around transaction start/cancel/commit handlers.
+ */
+
+/* Arguments for db_start_transaction() */
+struct db_start_transaction_state {
+	uint32_t transaction_id;
+	bool transaction_started;
+};
+
+/*
+  Start a recovery transaction on one frozen database.
+  private_data is a struct db_start_transaction_state.
+  On success the database records that a transaction is running and
+  remembers its id. Returns 0 on success, -1 if the database is not
+  frozen or the start handler fails.
+ */
+static int db_start_transaction(struct ctdb_db_context *ctdb_db,
+				void *private_data)
+{
+	struct db_start_transaction_state *state =
+		(struct db_start_transaction_state *)private_data;
+	int ret;
+	bool transaction_started;
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,
+		      ("Database %s not frozen, cannot start transaction\n",
+		       ctdb_db->db_name));
+		return -1;
+	}
+
+	/*
+	 * Both operands are bool, so use the logical operator: the handler
+	 * is told a transaction already exists only when the caller says so
+	 * and this database has one recorded.
+	 */
+	transaction_started = state->transaction_started &&
+			      ctdb_db->freeze_transaction_started;
+
+	ret = db_transaction_start_handler(ctdb_db,
+					   &transaction_started);
+	if (ret != 0) {
+		return -1;
+	}
+
+	ctdb_db->freeze_transaction_started = true;
+	ctdb_db->freeze_transaction_id = state->transaction_id;
+
+	return 0;
+}
+
+/*
+  Cancel the recovery transaction on one database.
+  private_data is passed through to the cancel handler. The "transaction
+  started" flag is cleared only when the handler succeeds; a handler
+  failure is returned unchanged.
+ */
+static int db_cancel_transaction(struct ctdb_db_context *ctdb_db,
+				 void *private_data)
+{
+	int ret;
+
+	ret = db_transaction_cancel_handler(ctdb_db, private_data);
+	if (ret != 0) {
+		return ret;
+	}
+
+	ctdb_db->freeze_transaction_started = false;
+
+	return 0;
+}
+
+/* Arguments for db_commit_transaction() */
+struct db_commit_transaction_state {
+	uint32_t transaction_id;
+	unsigned int healthy_nodes;
+};
+
+/*
+  Commit the recovery transaction on one frozen database.
+  private_data is a struct db_commit_transaction_state.
+  The commit is refused (-1) unless the database is frozen, a transaction
+  has been started and its id matches the one being committed. On
+  success the transaction state is reset and the transaction id becomes
+  the database generation.
+ */
+static int db_commit_transaction(struct ctdb_db_context *ctdb_db,
+				 void *private_data)
+{
+	struct db_commit_transaction_state *state =
+		(struct db_commit_transaction_state *)private_data;
+	int ret;
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,
+		      ("Database %s not frozen, cannot commit transaction\n",
+		       ctdb_db->db_name));
+		return -1;
+	}
+
+	if (!ctdb_db->freeze_transaction_started) {
+		DEBUG(DEBUG_ERR, ("Transaction not started on %s\n",
+				  ctdb_db->db_name));
+		return -1;
+	}
+
+	if (ctdb_db->freeze_transaction_id != state->transaction_id) {
+		DEBUG(DEBUG_ERR,
+		      ("Incorrect transaction commit id 0x%08x for %s\n",
+		       state->transaction_id, ctdb_db->db_name));
+		return -1;
+	}
+
+	ret = db_transaction_commit_handler(ctdb_db, &state->healthy_nodes);
+	if (ret != 0) {
+		return -1;
+	}
+
+	ctdb_db->freeze_transaction_started = false;
+	ctdb_db->freeze_transaction_id = 0;
+	ctdb_db->generation = state->transaction_id;
+	return 0;
+}
+
+/**
+ * Start a transaction on a database - used for db recovery
+ *
+ * indata is read as a struct ctdb_transdb (database id and transaction id).
+ * Returns -1 for an unknown database, otherwise the result of
+ * db_start_transaction().
+ */
+int32_t ctdb_control_db_transaction_start(struct ctdb_context *ctdb,
+					  TDB_DATA indata)
+{
+	struct ctdb_transdb *w =
+		(struct ctdb_transdb *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+	struct db_start_transaction_state state;
+
+	ctdb_db = find_ctdb_db(ctdb, w->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Transaction start for unknown dbid 0x%08x\n",
+		       w->db_id));
+		return -1;
+	}
+
+	state.transaction_id = w->tid;
+	state.transaction_started = true;
+
+	return db_start_transaction(ctdb_db, &state);
+}
+
+/**
+ * Cancel a transaction on a database - used for db recovery
+ *
+ * indata is read as a single uint32_t database id. Returns -1 for an
+ * unknown database, otherwise the result of db_cancel_transaction().
+ */
+int32_t ctdb_control_db_transaction_cancel(struct ctdb_context *ctdb,
+					   TDB_DATA indata)
+{
+	uint32_t db_id = *(uint32_t *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Transaction cancel for unknown dbid 0x%08x\n", db_id));
+		return -1;
+	}
+
+	DEBUG(DEBUG_ERR, ("Recovery db transaction cancelled for %s\n",
+			  ctdb_db->db_name));
+
+	return db_cancel_transaction(ctdb_db, NULL);
+}
+
+/**
+ * Commit a transaction on a database - used for db recovery
+ *
+ * indata is read as a struct ctdb_transdb. The number of nodes whose flags
+ * are all clear is counted and handed to the commit handler. Returns -1
+ * for an unknown database, otherwise the result of db_commit_transaction().
+ */
+int32_t ctdb_control_db_transaction_commit(struct ctdb_context *ctdb,
+					   TDB_DATA indata)
+{
+	struct ctdb_transdb *w =
+		(struct ctdb_transdb *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+	struct db_commit_transaction_state state;
+	unsigned int healthy_nodes, i;
+
+	ctdb_db = find_ctdb_db(ctdb, w->db_id);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Transaction commit for unknown dbid 0x%08x\n",
+		       w->db_id));
+		return -1;
+	}
+
+	healthy_nodes = 0;
+	for (i=0; i < ctdb->num_nodes; i++) {
+		if (ctdb->nodes[i]->flags == 0) {
+			healthy_nodes += 1;
+		}
+	}
+
+	state.transaction_id = w->tid;
+	state.healthy_nodes = healthy_nodes;
+
+	return db_commit_transaction(ctdb_db, &state);
+}
+
+/*
+  wipe a database - only possible when in a frozen transaction
+
+  indata is read as a struct ctdb_transdb. The wipe is refused (-1) unless
+  the database is known, frozen, has a transaction started and the
+  transaction id matches. For volatile databases the delete and fetch
+  queues are discarded and recreated empty. Returns 0 on success.
+ */
+int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+	struct ctdb_transdb w = *(struct ctdb_transdb *)indata.dptr;
+	struct ctdb_db_context *ctdb_db;
+
+	ctdb_db = find_ctdb_db(ctdb, w.db_id);
+	if (!ctdb_db) {
+		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", w.db_id));
+		return -1;
+	}
+
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed wipe_database while not frozen\n"));
+		return -1;
+	}
+
+	if (!ctdb_db->freeze_transaction_started) {
+		DEBUG(DEBUG_ERR,(__location__ " transaction not started\n"));
+		return -1;
+	}
+
+	if (w.tid != ctdb_db->freeze_transaction_id) {
+		DEBUG(DEBUG_ERR,(__location__ " incorrect transaction id 0x%x in wipe_database\n", w.tid));
+		return -1;
+	}
+
+	if (tdb_wipe_all(ctdb_db->ltdb->tdb) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to wipe database for db '%s'\n",
+			 ctdb_db->db_name));
+		return -1;
+	}
+
+	if (ctdb_db_volatile(ctdb_db)) {
+		/*
+		 * Free and clear both queue pointers before recreating them,
+		 * so that an early return on allocation failure never leaves
+		 * a pointer to freed memory behind.
+		 */
+		TALLOC_FREE(ctdb_db->delete_queue);
+		TALLOC_FREE(ctdb_db->fetch_queue);
+		ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->delete_queue == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " Failed to re-create "
+					  "the delete queue.\n"));
+			return -1;
+		}
+		ctdb_db->fetch_queue = trbt_create(ctdb_db, 0);
+		if (ctdb_db->fetch_queue == NULL) {
+			DEBUG(DEBUG_ERR, (__location__ " Failed to re-create "
+					  "the fetch queue.\n"));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Is this database fully frozen (not merely pending)? */
+bool ctdb_db_frozen(struct ctdb_db_context *ctdb_db)
+{
+	if (ctdb_db->freeze_mode != CTDB_FREEZE_FROZEN) {
+		return false;
+	}
+
+	return true;
+}
+
+/* Is the node-wide freeze mode fully frozen (not merely pending)? */
+bool ctdb_db_all_frozen(struct ctdb_context *ctdb)
+{
+	if (ctdb->freeze_mode != CTDB_FREEZE_FROZEN) {
+		return false;
+	}
+	return true;
+}
+
+/*
+  Decide whether the database may be accessed: always when it is not
+  frozen, and while frozen or freezing only if a transaction has been
+  started on it.
+ */
+bool ctdb_db_allow_access(struct ctdb_db_context *ctdb_db)
+{
+	if (ctdb_db->freeze_mode == CTDB_FREEZE_NONE) {
+		/* If database is not frozen, then allow access. */
+		return true;
+	} else if (ctdb_db->freeze_transaction_started) {
+		/* If database is frozen, allow access only if the
+		 * transaction is started. This is required during
+		 * recovery.
+		 *
+		 * If a node is inactive, then transaction is not started.
+		 */
+		return true;
+	}
+
+	return false;
+}
diff --git a/ctdb/server/ctdb_keepalive.c b/ctdb/server/ctdb_keepalive.c
new file mode 100644
index 0000000..9155ade
--- /dev/null
+++ b/ctdb/server/ctdb_keepalive.c
@@ -0,0 +1,234 @@
+/*
+   monitoring links to all other nodes to detect dead nodes
+
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/time.h"
+#include "system/wait.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+
+#include "ctdb_private.h"
+#include "version.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+
+/*
+  Version number advertised in keepalive packets.
+
+  Computed once and cached: (major << 16) | minor of the Samba version,
+  unless the CTDB_TEST_SAMBA_VERSION environment variable holds a positive
+  decimal number that fits in 32 bits, in which case that value is used.
+  An override that cannot be parsed is reported and ignored.
+ */
+static uint32_t keepalive_version(void)
+{
+	static uint32_t version = 0;
+
+	if (version == 0) {
+		const char *t;
+
+		version = (SAMBA_VERSION_MAJOR << 16) | SAMBA_VERSION_MINOR;
+
+		t = getenv("CTDB_TEST_SAMBA_VERSION");
+		if (t != NULL) {
+			char *end = NULL;
+			long v;
+
+			/*
+			 * strtol() instead of atoi(): out-of-range input is
+			 * well defined and trailing garbage can be rejected.
+			 */
+			v = strtol(t, &end, 10);
+			if (end == t || *end != '\0' ||
+			    v <= 0 || (unsigned long)v > UINT32_MAX) {
+				DBG_WARNING("Failed to parse env var: %s\n", t);
+			} else {
+				version = (uint32_t)v;
+			}
+		}
+	}
+
+	return version;
+}
+
+/* Seconds elapsed since the recorded daemon start time */
+static uint32_t keepalive_uptime(struct ctdb_context *ctdb)
+{
+	struct timeval current = tevent_timeval_current();
+
+	return current.tv_sec - ctdb->ctdbd_start_time.tv_sec;
+}
+
+/*
+   send a keepalive packet to the other node
+
+   The packet carries our version and uptime. Nothing is sent while the
+   transport is down (ctdb->methods == NULL).
+*/
+static void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
+{
+	struct ctdb_req_keepalive_old *r;
+
+	if (ctdb->methods == NULL) {
+		DEBUG(DEBUG_INFO,
+		      ("Failed to send keepalive. Transport is DOWN\n"));
+		return;
+	}
+
+	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_KEEPALIVE,
+				    sizeof(struct ctdb_req_keepalive_old),
+				    struct ctdb_req_keepalive_old);
+	CTDB_NO_MEMORY_FATAL(ctdb, r);
+	r->hdr.destnode = destnode;
+	r->hdr.reqid = 0;
+
+	r->version = keepalive_version();
+	r->uptime = keepalive_uptime(ctdb);
+
+	CTDB_INCREMENT_STAT(ctdb, keepalive_packets_sent);
+
+	ctdb_queue_packet(ctdb, &r->hdr);
+
+	talloc_free(r);
+}
+
+/*
+  see if any nodes are dead
+
+  Timer callback, run once per keepalive interval. For every connected
+  remote node it updates the count of consecutive intervals without
+  received traffic, declares the node dead once that count reaches the
+  keepalive limit, and sends a keepalive. Finally it re-arms itself.
+ */
+static void ctdb_check_for_dead_nodes(struct tevent_context *ev,
+				      struct tevent_timer *te,
+				      struct timeval t, void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	unsigned int i;
+
+	/*
+	 * send a keepalive to all other nodes, unless they are deleted
+	 * or disconnected (and never to ourselves)
+	 */
+	for (i=0;i<ctdb->num_nodes;i++) {
+		struct ctdb_node *node = ctdb->nodes[i];
+
+		if (node->flags & NODE_FLAGS_DELETED) {
+			continue;
+		}
+
+		if (node->pnn == ctdb->pnn) {
+			continue;
+		}
+
+		if (node->flags & NODE_FLAGS_DISCONNECTED) {
+			/* it might have come alive again */
+			if (node->rx_cnt != 0) {
+				ctdb_node_connected(node);
+			}
+			continue;
+		}
+
+
+		/* count consecutive intervals in which nothing was received */
+		if (node->rx_cnt == 0) {
+			node->dead_count++;
+		} else {
+			node->dead_count = 0;
+		}
+
+		node->rx_cnt = 0;
+
+		if (node->dead_count >= ctdb->tunable.keepalive_limit) {
+			DEBUG(DEBUG_NOTICE,("dead count reached for node %u\n", node->pnn));
+			ctdb_node_dead(node);
+			ctdb_send_keepalive(ctdb, node->pnn);
+			/* maybe tell the transport layer to kill the
+			   sockets as well?
+			*/
+			continue;
+		}
+
+		DEBUG(DEBUG_DEBUG,("sending keepalive to %u\n", node->pnn));
+		ctdb_send_keepalive(ctdb, node->pnn);
+
+		node->tx_cnt = 0;
+	}
+
+	te = tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
+			      timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
+			      ctdb_check_for_dead_nodes, ctdb);
+	if (te == NULL) {
+		/* without a new timer this check never runs again */
+		DEBUG(DEBUG_ERR,
+		      ("Failed to re-arm keepalive timer\n"));
+	}
+}
+
+
+/*
+  Start keepalive monitoring: create the context that owns the timer and
+  schedule the first dead-node check one keepalive interval from now.
+ */
+void ctdb_start_keepalive(struct ctdb_context *ctdb)
+{
+	struct tevent_timer *te;
+
+	ctdb->keepalive_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->keepalive_ctx);
+
+	te = tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
+			      timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
+			      ctdb_check_for_dead_nodes, ctdb);
+	CTDB_NO_MEMORY_FATAL(ctdb, te);
+
+	DEBUG(DEBUG_NOTICE,("Keepalive monitoring has been started\n"));
+
+	if (ctdb->tunable.allow_mixed_versions == 1) {
+		DEBUG(DEBUG_WARNING,
+		      ("CTDB cluster with mixed versions configured\n"));
+	}
+}
+
+/* Stop keepalive monitoring by freeing the context that owns the timer */
+void ctdb_stop_keepalive(struct ctdb_context *ctdb)
+{
+	talloc_free(ctdb->keepalive_ctx);
+	ctdb->keepalive_ctx = NULL;
+}
+
+/*
+  Handle a keepalive received from another node.
+
+  Unless mixed versions are allowed, the sender's version is compared with
+  ours. This node shuts down when the packet carries no version at all
+  (header-only packet), or when the versions differ and the sender has
+  been up longer than us, or equally long with a higher version.
+ */
+void ctdb_request_keepalive(struct ctdb_context *ctdb,
+			    struct ctdb_req_header *hdr)
+{
+	struct ctdb_req_keepalive_old *c =
+		(struct ctdb_req_keepalive_old *)hdr;
+	uint32_t my_version = keepalive_version();
+	uint32_t my_uptime = keepalive_uptime(ctdb);
+
+	/* Don't check anything if mixed versions are allowed */
+	if (ctdb->tunable.allow_mixed_versions == 1) {
+		return;
+	}
+
+	if (hdr->length == sizeof(struct ctdb_req_header)) {
+		/* Old keepalive */
+		goto fail1;
+	}
+
+	if (c->version != my_version) {
+		if (c->uptime > my_uptime) {
+			goto fail2;
+		} else if (c->uptime == my_uptime) {
+			if (c->version > my_version) {
+				goto fail2;
+			}
+		}
+	}
+
+	return;
+
+fail1:
+	DEBUG(DEBUG_ERR,
+	      ("Keepalive version missing from node %u\n", hdr->srcnode));
+	goto shutdown;
+
+fail2:
+	DEBUG(DEBUG_ERR,
+	      ("Keepalive version mismatch 0x%08x != 0x%08x from node %u\n",
+	       my_version, c->version, hdr->srcnode));
+	goto shutdown;
+
+shutdown:
+	DEBUG(DEBUG_ERR,
+	      ("CTDB Cluster with mixed versions, cannot continue\n"));
+	ctdb_shutdown_sequence(ctdb, 0);
+}
diff --git a/ctdb/server/ctdb_lock.c b/ctdb/server/ctdb_lock.c
new file mode 100644
index 0000000..063ebfa
--- /dev/null
+++ b/ctdb/server/ctdb_lock.c
@@ -0,0 +1,996 @@
+/*
+   ctdb lock handling
+   provide API to do non-blocking locks for single or all databases
+
+   Copyright (C) Amitay Isaacs 2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/network.h"
+
+#include <talloc.h>
+#include <tevent.h>
+
+#include "lib/tdb_wrap/tdb_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "lib/util/debug.h"
+#include "lib/util/samba_util.h"
+#include "lib/util/sys_rw.h"
+
+#include "ctdb_private.h"
+
+#include "common/common.h"
+#include "common/logging.h"
+
+/*
+ * Non-blocking Locking API
+ *
+ * 1. Create a child process to do blocking locks.
+ * 2. Once the locks are obtained, signal parent process via fd.
+ * 3. Invoke registered callback routine with locking status.
+ * 4. If the child process cannot get locks within certain time,
+ *    execute an external script to debug.
+ * + * ctdb_lock_record() - get a lock on a record + * ctdb_lock_db() - get a lock on a DB + * + * auto_mark - whether to mark/unmark DBs in before/after callback + * = false is used for freezing databases for + * recovery since the recovery cannot start till + * databases are locked on all the nodes. + * = true is used for record locks. + */ + +enum lock_type { + LOCK_RECORD, + LOCK_DB, +}; + +static const char * const lock_type_str[] = { + "lock_record", + "lock_db", +}; + +struct lock_request; + +/* lock_context is the common part for a lock request */ +struct lock_context { + struct lock_context *next, *prev; + enum lock_type type; + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + TDB_DATA key; + uint32_t priority; + bool auto_mark; + struct lock_request *request; + pid_t child; + int fd[2]; + struct tevent_fd *tfd; + struct tevent_timer *ttimer; + struct timeval start_time; + uint32_t key_hash; + bool can_schedule; +}; + +/* lock_request is the client specific part for a lock request */ +struct lock_request { + struct lock_context *lctx; + void (*callback)(void *, bool); + void *private_data; +}; + + +int ctdb_db_iterator(struct ctdb_context *ctdb, ctdb_db_handler_t handler, + void *private_data) +{ + struct ctdb_db_context *ctdb_db; + int ret; + + for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) { + ret = handler(ctdb_db, private_data); + if (ret != 0) { + return -1; + } + } + + return 0; +} + +/* + * lock all databases - mark only + */ +static int db_lock_mark_handler(struct ctdb_db_context *ctdb_db, + void *private_data) +{ + int tdb_transaction_write_lock_mark(struct tdb_context *); + + DEBUG(DEBUG_INFO, ("marking locked database %s\n", ctdb_db->db_name)); + + if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) { + DEBUG(DEBUG_ERR, ("Failed to mark (transaction lock) database %s\n", + ctdb_db->db_name)); + return -1; + } + + if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) { + DEBUG(DEBUG_ERR, ("Failed to mark (all 
lock) database %s\n", + ctdb_db->db_name)); + return -1; + } + + return 0; +} + +int ctdb_lockdb_mark(struct ctdb_db_context *ctdb_db) +{ + if (!ctdb_db_frozen(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("Attempt to mark database locked when not frozen\n")); + return -1; + } + + return db_lock_mark_handler(ctdb_db, NULL); +} + +/* + * lock all databases - unmark only + */ +static int db_lock_unmark_handler(struct ctdb_db_context *ctdb_db, + void *private_data) +{ + int tdb_transaction_write_lock_unmark(struct tdb_context *); + + DEBUG(DEBUG_INFO, ("unmarking locked database %s\n", ctdb_db->db_name)); + + if (tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb) != 0) { + DEBUG(DEBUG_ERR, ("Failed to unmark (transaction lock) database %s\n", + ctdb_db->db_name)); + return -1; + } + + if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) { + DEBUG(DEBUG_ERR, ("Failed to unmark (all lock) database %s\n", + ctdb_db->db_name)); + return -1; + } + + return 0; +} + +int ctdb_lockdb_unmark(struct ctdb_db_context *ctdb_db) +{ + if (!ctdb_db_frozen(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("Attempt to unmark database locked when not frozen\n")); + return -1; + } + + return db_lock_unmark_handler(ctdb_db, NULL); +} + +static void ctdb_lock_schedule(struct ctdb_context *ctdb); + +/* + * Destructor to kill the child locking process + */ +static int ctdb_lock_context_destructor(struct lock_context *lock_ctx) +{ + if (lock_ctx->request) { + lock_ctx->request->lctx = NULL; + } + if (lock_ctx->child > 0) { + ctdb_kill(lock_ctx->ctdb, lock_ctx->child, SIGTERM); + if (lock_ctx->type == LOCK_RECORD) { + DLIST_REMOVE(lock_ctx->ctdb_db->lock_current, lock_ctx); + } else { + DLIST_REMOVE(lock_ctx->ctdb->lock_current, lock_ctx); + } + if (lock_ctx->ctdb_db->lock_num_current == 0) { + ctdb_fatal(NULL, "Lock count is 0 before decrement\n"); + } + lock_ctx->ctdb_db->lock_num_current--; + CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_current); + CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current); + } else 
{ + if (lock_ctx->type == LOCK_RECORD) { + DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx); + } else { + DLIST_REMOVE(lock_ctx->ctdb->lock_pending, lock_ctx); + } + CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending); + CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending); + } + + ctdb_lock_schedule(lock_ctx->ctdb); + + return 0; +} + + +/* + * Destructor to remove lock request + */ +static int ctdb_lock_request_destructor(struct lock_request *lock_request) +{ + if (lock_request->lctx == NULL) { + return 0; + } + + lock_request->lctx->request = NULL; + TALLOC_FREE(lock_request->lctx); + + return 0; +} + +/* + * Process all the callbacks waiting for lock + * + * If lock has failed, callback is executed with locked=false + */ +static void process_callbacks(struct lock_context *lock_ctx, bool locked) +{ + struct lock_request *request; + bool auto_mark = lock_ctx->auto_mark; + + if (auto_mark && locked) { + switch (lock_ctx->type) { + case LOCK_RECORD: + tdb_chainlock_mark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key); + break; + + case LOCK_DB: + (void)ctdb_lockdb_mark(lock_ctx->ctdb_db); + break; + } + } + + request = lock_ctx->request; + if (auto_mark) { + /* Since request may be freed in the callback, unset the lock + * context, so request destructor will not free lock context. 
+ */ + request->lctx = NULL; + } + + /* Since request may be freed in the callback, unset the request */ + lock_ctx->request = NULL; + + request->callback(request->private_data, locked); + + if (!auto_mark) { + return; + } + + if (locked) { + switch (lock_ctx->type) { + case LOCK_RECORD: + tdb_chainlock_unmark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key); + break; + + case LOCK_DB: + ctdb_lockdb_unmark(lock_ctx->ctdb_db); + break; + } + } + + talloc_free(lock_ctx); +} + + +static int lock_bucket_id(double t) +{ + double ms = 1.e-3, s = 1; + int id; + + if (t < 1*ms) { + id = 0; + } else if (t < 10*ms) { + id = 1; + } else if (t < 100*ms) { + id = 2; + } else if (t < 1*s) { + id = 3; + } else if (t < 2*s) { + id = 4; + } else if (t < 4*s) { + id = 5; + } else if (t < 8*s) { + id = 6; + } else if (t < 16*s) { + id = 7; + } else if (t < 32*s) { + id = 8; + } else if (t < 64*s) { + id = 9; + } else { + id = 10; + } + + return id; +} + +/* + * Callback routine when the required locks are obtained. + * Called from parent context + */ +static void ctdb_lock_handler(struct tevent_context *ev, + struct tevent_fd *tfd, + uint16_t flags, + void *private_data) +{ + struct lock_context *lock_ctx; + char c; + bool locked; + double t; + int id; + + lock_ctx = talloc_get_type_abort(private_data, struct lock_context); + + /* cancel the timeout event */ + TALLOC_FREE(lock_ctx->ttimer); + + t = timeval_elapsed(&lock_ctx->start_time); + id = lock_bucket_id(t); + + /* Read the status from the child process */ + if (sys_read(lock_ctx->fd[0], &c, 1) != 1) { + locked = false; + } else { + locked = (c == 0 ? 
true : false); + } + + /* Update statistics */ + CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_calls); + CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_calls); + + if (locked) { + CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.buckets[id]); + CTDB_UPDATE_LATENCY(lock_ctx->ctdb, lock_ctx->ctdb_db, + lock_type_str[lock_ctx->type], locks.latency, + lock_ctx->start_time); + + CTDB_UPDATE_DB_LATENCY(lock_ctx->ctdb_db, lock_type_str[lock_ctx->type], locks.latency, t); + CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.buckets[id]); + } else { + CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_failed); + CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_failed); + } + + process_callbacks(lock_ctx, locked); +} + +struct lock_log_entry { + struct db_hash_context *lock_log; + TDB_DATA key; + unsigned long log_sec; + struct tevent_timer *timer; +}; + +static int lock_log_fetch_parser(uint8_t *keybuf, size_t keylen, + uint8_t *databuf, size_t datalen, + void *private_data) +{ + struct lock_log_entry **entry = + (struct lock_log_entry **)private_data; + + if (datalen != sizeof(struct lock_log_entry *)) { + return EINVAL; + } + + *entry = talloc_get_type_abort(*(void **)databuf, + struct lock_log_entry); + return 0; +} + +static void lock_log_cleanup(struct tevent_context *ev, + struct tevent_timer *ttimer, + struct timeval current_time, + void *private_data) +{ + struct lock_log_entry *entry = talloc_get_type_abort( + private_data, struct lock_log_entry); + int ret; + + entry->timer = NULL; + + ret = db_hash_delete(entry->lock_log, entry->key.dptr, + entry->key.dsize); + if (ret != 0) { + return; + } + talloc_free(entry); +} + +static bool lock_log_skip(struct tevent_context *ev, + struct db_hash_context *lock_log, + TDB_DATA key, unsigned long elapsed_sec) +{ + struct lock_log_entry *entry = NULL; + int ret; + + ret = db_hash_fetch(lock_log, key.dptr, key.dsize, + lock_log_fetch_parser, &entry); + if (ret == ENOENT) { + + entry = talloc_zero(lock_log, struct lock_log_entry); + 
if (entry == NULL) { + goto fail; + } + + entry->lock_log = lock_log; + + entry->key.dptr = talloc_memdup(entry, key.dptr, key.dsize); + if (entry->key.dptr == NULL) { + talloc_free(entry); + goto fail; + } + entry->key.dsize = key.dsize; + + entry->log_sec = elapsed_sec; + entry->timer = tevent_add_timer(ev, entry, + timeval_current_ofs(30, 0), + lock_log_cleanup, entry); + if (entry->timer == NULL) { + talloc_free(entry); + goto fail; + } + + ret = db_hash_add(lock_log, key.dptr, key.dsize, + (uint8_t *)&entry, + sizeof(struct lock_log_entry *)); + if (ret != 0) { + talloc_free(entry); + goto fail; + } + + return false; + + } else if (ret == EINVAL) { + + ret = db_hash_delete(lock_log, key.dptr, key.dsize); + if (ret != 0) { + goto fail; + } + + return false; + + } else if (ret == 0) { + + if (elapsed_sec <= entry->log_sec) { + return true; + } + + entry->log_sec = elapsed_sec; + + TALLOC_FREE(entry->timer); + entry->timer = tevent_add_timer(ev, entry, + timeval_current_ofs(30, 0), + lock_log_cleanup, entry); + if (entry->timer == NULL) { + ret = db_hash_delete(lock_log, key.dptr, key.dsize); + if (ret != 0) { + goto fail; + } + talloc_free(entry); + } + + return false; + } + + +fail: + return false; + +} + +static const char **debug_locks_args(TALLOC_CTX *mem_ctx, struct lock_context *lock_ctx) +{ + const char **args = NULL; + int tdb_flags; + int nargs, i; + + /* Program, lock helper PID, db|record, tdb path, fcntl|mutex, NULL */ + nargs = 6; + + args = talloc_array(mem_ctx, const char *, nargs); + if (args == NULL) { + return NULL; + } + + args[0] = talloc_strdup(args, "debug_locks"); + args[1] = talloc_asprintf(args, "%d", lock_ctx->child); + + if (lock_ctx->type == LOCK_RECORD) { + args[2] = talloc_strdup(args, "RECORD"); + } else { + args[2] = talloc_strdup(args, "DB"); + } + + args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path); + + tdb_flags = tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb); + if (tdb_flags & TDB_MUTEX_LOCKING) { + args[4] = 
talloc_strdup(args, "MUTEX"); + } else { + args[4] = talloc_strdup(args, "FCNTL"); + } + + args[5] = NULL; + + for (i=0; i<nargs-1; i++) { + if (args[i] == NULL) { + talloc_free(args); + return NULL; + } + } + + return args; +} + +/* + * Callback routine when required locks are not obtained within timeout + * Called from parent context + */ +static void ctdb_lock_timeout_handler(struct tevent_context *ev, + struct tevent_timer *ttimer, + struct timeval current_time, + void *private_data) +{ + static char debug_locks[PATH_MAX+1] = ""; + struct lock_context *lock_ctx; + struct ctdb_context *ctdb; + pid_t pid; + double elapsed_time; + bool skip; + char *keystr; + const char **args; + + lock_ctx = talloc_get_type_abort(private_data, struct lock_context); + ctdb = lock_ctx->ctdb; + + elapsed_time = timeval_elapsed(&lock_ctx->start_time); + + /* For database locks, always log */ + if (lock_ctx->type == LOCK_DB) { + DEBUG(DEBUG_WARNING, + ("Unable to get DB lock on database %s for " + "%.0lf seconds\n", + lock_ctx->ctdb_db->db_name, elapsed_time)); + goto lock_debug; + } + + /* For record locks, check if we have already logged */ + skip = lock_log_skip(ev, lock_ctx->ctdb_db->lock_log, + lock_ctx->key, (unsigned long)elapsed_time); + if (skip) { + goto skip_lock_debug; + } + + keystr = hex_encode_talloc(lock_ctx, lock_ctx->key.dptr, + lock_ctx->key.dsize); + DEBUG(DEBUG_WARNING, + ("Unable to get RECORD lock on database %s for %.0lf seconds" + " (key %s)\n", + lock_ctx->ctdb_db->db_name, elapsed_time, + keystr ? 
keystr : "")); + TALLOC_FREE(keystr); + + /* If a node stopped/banned, don't spam the logs */ + if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) { + goto skip_lock_debug; + } + +lock_debug: + + if (ctdb_set_helper("lock debugging helper", + debug_locks, sizeof(debug_locks), + "CTDB_DEBUG_LOCKS", + getenv("CTDB_BASE"), "debug_locks.sh")) { + args = debug_locks_args(lock_ctx, lock_ctx); + if (args != NULL) { + pid = vfork(); + if (pid == 0) { + execvp(debug_locks, discard_const(args)); + _exit(0); + } + talloc_free(args); + ctdb_track_child(ctdb, pid); + } else { + D_WARNING("No memory for debug locks args\n"); + } + } else { + DEBUG(DEBUG_WARNING, + (__location__ + " Unable to setup lock debugging\n")); + } + +skip_lock_debug: + + /* reset the timeout timer */ + // talloc_free(lock_ctx->ttimer); + lock_ctx->ttimer = tevent_add_timer(ctdb->ev, + lock_ctx, + timeval_current_ofs(10, 0), + ctdb_lock_timeout_handler, + (void *)lock_ctx); +} + +static bool lock_helper_args(TALLOC_CTX *mem_ctx, + struct lock_context *lock_ctx, int fd, + int *argc, const char ***argv) +{ + const char **args = NULL; + int nargs = 0, i; + + switch (lock_ctx->type) { + case LOCK_RECORD: + nargs = 6; + break; + + case LOCK_DB: + nargs = 5; + break; + } + + /* Add extra argument for null termination */ + nargs++; + + args = talloc_array(mem_ctx, const char *, nargs); + if (args == NULL) { + return false; + } + + args[0] = talloc_asprintf(args, "%d", getpid()); + args[1] = talloc_asprintf(args, "%d", fd); + + switch (lock_ctx->type) { + case LOCK_RECORD: + args[2] = talloc_strdup(args, "RECORD"); + args[3] = talloc_strdup(args, lock_ctx->ctdb_db->db_path); + args[4] = talloc_asprintf(args, "0x%x", + tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb)); + if (lock_ctx->key.dsize == 0) { + args[5] = talloc_strdup(args, "NULL"); + } else { + args[5] = hex_encode_talloc(args, lock_ctx->key.dptr, lock_ctx->key.dsize); + } + break; + + case LOCK_DB: + args[2] = talloc_strdup(args, "DB"); + args[3] = 
talloc_strdup(args, lock_ctx->ctdb_db->db_path); + args[4] = talloc_asprintf(args, "0x%x", + tdb_get_flags(lock_ctx->ctdb_db->ltdb->tdb)); + break; + } + + /* Make sure last argument is NULL */ + args[nargs-1] = NULL; + + for (i=0; i<nargs-1; i++) { + if (args[i] == NULL) { + talloc_free(args); + return false; + } + } + + *argc = nargs; + *argv = args; + return true; +} + +/* + * Find a lock request that can be scheduled + */ +static struct lock_context *ctdb_find_lock_context(struct ctdb_context *ctdb) +{ + struct lock_context *lock_ctx, *next_ctx; + struct ctdb_db_context *ctdb_db; + + /* First check if there are database lock requests */ + + for (lock_ctx = ctdb->lock_pending; lock_ctx != NULL; + lock_ctx = next_ctx) { + + if (lock_ctx->request != NULL) { + /* Found a lock context with a request */ + return lock_ctx; + } + + next_ctx = lock_ctx->next; + + DEBUG(DEBUG_INFO, ("Removing lock context without lock " + "request\n")); + DLIST_REMOVE(ctdb->lock_pending, lock_ctx); + CTDB_DECREMENT_STAT(ctdb, locks.num_pending); + CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending); + talloc_free(lock_ctx); + } + + /* Next check database queues */ + for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) { + if (ctdb_db->lock_num_current == + ctdb->tunable.lock_processes_per_db) { + continue; + } + + for (lock_ctx = ctdb_db->lock_pending; lock_ctx != NULL; + lock_ctx = next_ctx) { + + next_ctx = lock_ctx->next; + + if (lock_ctx->request != NULL) { + return lock_ctx; + } + + DEBUG(DEBUG_INFO, ("Removing lock context without " + "lock request\n")); + DLIST_REMOVE(ctdb_db->lock_pending, lock_ctx); + CTDB_DECREMENT_STAT(ctdb, locks.num_pending); + CTDB_DECREMENT_DB_STAT(ctdb_db, locks.num_pending); + talloc_free(lock_ctx); + } + } + + return NULL; +} + +/* + * Schedule a new lock child process + * Set up callback handler and timeout handler + */ +static void ctdb_lock_schedule(struct ctdb_context *ctdb) +{ + struct lock_context *lock_ctx; + int ret, argc; + 
TALLOC_CTX *tmp_ctx; + static char prog[PATH_MAX+1] = ""; + const char **args; + + if (!ctdb_set_helper("lock helper", + prog, sizeof(prog), + "CTDB_LOCK_HELPER", + CTDB_HELPER_BINDIR, "ctdb_lock_helper")) { + ctdb_die(ctdb, __location__ + " Unable to set lock helper\n"); + } + + /* Find a lock context with requests */ + lock_ctx = ctdb_find_lock_context(ctdb); + if (lock_ctx == NULL) { + return; + } + + lock_ctx->child = -1; + ret = pipe(lock_ctx->fd); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to create pipe in ctdb_lock_schedule\n")); + return; + } + + set_close_on_exec(lock_ctx->fd[0]); + + /* Create data for child process */ + tmp_ctx = talloc_new(lock_ctx); + if (tmp_ctx == NULL) { + DEBUG(DEBUG_ERR, ("Failed to allocate memory for helper args\n")); + close(lock_ctx->fd[0]); + close(lock_ctx->fd[1]); + return; + } + + if (! ctdb->do_setsched) { + ret = setenv("CTDB_NOSETSCHED", "1", 1); + if (ret != 0) { + DEBUG(DEBUG_WARNING, + ("Failed to set CTDB_NOSETSCHED variable\n")); + } + } + + /* Create arguments for lock helper */ + if (!lock_helper_args(tmp_ctx, lock_ctx, lock_ctx->fd[1], + &argc, &args)) { + DEBUG(DEBUG_ERR, ("Failed to create lock helper args\n")); + close(lock_ctx->fd[0]); + close(lock_ctx->fd[1]); + talloc_free(tmp_ctx); + return; + } + + lock_ctx->child = ctdb_vfork_exec(lock_ctx, ctdb, prog, argc, + (const char **)args); + if (lock_ctx->child == -1) { + DEBUG(DEBUG_ERR, ("Failed to create a child in ctdb_lock_schedule\n")); + close(lock_ctx->fd[0]); + close(lock_ctx->fd[1]); + talloc_free(tmp_ctx); + return; + } + + /* Parent process */ + close(lock_ctx->fd[1]); + + talloc_free(tmp_ctx); + + /* Set up timeout handler */ + lock_ctx->ttimer = tevent_add_timer(ctdb->ev, + lock_ctx, + timeval_current_ofs(10, 0), + ctdb_lock_timeout_handler, + (void *)lock_ctx); + if (lock_ctx->ttimer == NULL) { + ctdb_kill(ctdb, lock_ctx->child, SIGTERM); + lock_ctx->child = -1; + close(lock_ctx->fd[0]); + return; + } + + /* Set up callback */ + lock_ctx->tfd 
= tevent_add_fd(ctdb->ev, + lock_ctx, + lock_ctx->fd[0], + TEVENT_FD_READ, + ctdb_lock_handler, + (void *)lock_ctx); + if (lock_ctx->tfd == NULL) { + TALLOC_FREE(lock_ctx->ttimer); + ctdb_kill(ctdb, lock_ctx->child, SIGTERM); + lock_ctx->child = -1; + close(lock_ctx->fd[0]); + return; + } + tevent_fd_set_auto_close(lock_ctx->tfd); + + /* Move the context from pending to current */ + if (lock_ctx->type == LOCK_RECORD) { + DLIST_REMOVE(lock_ctx->ctdb_db->lock_pending, lock_ctx); + DLIST_ADD_END(lock_ctx->ctdb_db->lock_current, lock_ctx); + } else { + DLIST_REMOVE(ctdb->lock_pending, lock_ctx); + DLIST_ADD_END(ctdb->lock_current, lock_ctx); + } + CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending); + CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current); + lock_ctx->ctdb_db->lock_num_current++; + CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending); + CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current); +} + + +/* + * Lock record / db depending on type + */ +static struct lock_request *ctdb_lock_internal(TALLOC_CTX *mem_ctx, + struct ctdb_context *ctdb, + struct ctdb_db_context *ctdb_db, + TDB_DATA key, + uint32_t priority, + void (*callback)(void *, bool), + void *private_data, + enum lock_type type, + bool auto_mark) +{ + struct lock_context *lock_ctx = NULL; + struct lock_request *request; + + if (callback == NULL) { + DEBUG(DEBUG_WARNING, ("No callback function specified, not locking\n")); + return NULL; + } + + lock_ctx = talloc_zero(ctdb, struct lock_context); + if (lock_ctx == NULL) { + DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n")); + return NULL; + } + + if ((request = talloc_zero(mem_ctx, struct lock_request)) == NULL) { + talloc_free(lock_ctx); + return NULL; + } + + lock_ctx->type = type; + lock_ctx->ctdb = ctdb; + lock_ctx->ctdb_db = ctdb_db; + lock_ctx->key.dsize = key.dsize; + if (key.dsize > 0) { + lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize); + if (lock_ctx->key.dptr == NULL) { + DEBUG(DEBUG_ERR, 
(__location__ "Memory allocation error\n")); + talloc_free(lock_ctx); + talloc_free(request); + return NULL; + } + lock_ctx->key_hash = ctdb_hash(&key); + } else { + lock_ctx->key.dptr = NULL; + } + lock_ctx->priority = priority; + lock_ctx->auto_mark = auto_mark; + + lock_ctx->request = request; + lock_ctx->child = -1; + + /* Non-record locks are required by recovery and should be scheduled + * immediately, so keep them at the head of the pending queue. + */ + if (lock_ctx->type == LOCK_RECORD) { + DLIST_ADD_END(ctdb_db->lock_pending, lock_ctx); + } else { + DLIST_ADD_END(ctdb->lock_pending, lock_ctx); + } + CTDB_INCREMENT_STAT(ctdb, locks.num_pending); + if (ctdb_db) { + CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending); + } + + /* Start the timer when we activate the context */ + lock_ctx->start_time = timeval_current(); + + request->lctx = lock_ctx; + request->callback = callback; + request->private_data = private_data; + + talloc_set_destructor(request, ctdb_lock_request_destructor); + talloc_set_destructor(lock_ctx, ctdb_lock_context_destructor); + + ctdb_lock_schedule(ctdb); + + return request; +} + + +/* + * obtain a lock on a record in a database + */ +struct lock_request *ctdb_lock_record(TALLOC_CTX *mem_ctx, + struct ctdb_db_context *ctdb_db, + TDB_DATA key, + bool auto_mark, + void (*callback)(void *, bool), + void *private_data) +{ + return ctdb_lock_internal(mem_ctx, + ctdb_db->ctdb, + ctdb_db, + key, + 0, + callback, + private_data, + LOCK_RECORD, + auto_mark); +} + + +/* + * obtain a lock on a database + */ +struct lock_request *ctdb_lock_db(TALLOC_CTX *mem_ctx, + struct ctdb_db_context *ctdb_db, + bool auto_mark, + void (*callback)(void *, bool), + void *private_data) +{ + return ctdb_lock_internal(mem_ctx, + ctdb_db->ctdb, + ctdb_db, + tdb_null, + 0, + callback, + private_data, + LOCK_DB, + auto_mark); +} diff --git a/ctdb/server/ctdb_lock_helper.c b/ctdb/server/ctdb_lock_helper.c new file mode 100644 index 0000000..51d2992 --- /dev/null +++ 
b/ctdb/server/ctdb_lock_helper.c @@ -0,0 +1,350 @@ +/* + ctdb lock helper + + Copyright (C) Amitay Isaacs 2013 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/network.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> +#include <tdb.h> + +#include "lib/util/sys_rw.h" +#include "lib/util/tevent_unix.h" + +#include "protocol/protocol.h" + +#include "common/system.h" + +static bool realtime = true; + +struct lock_state { + struct tdb_context *tdb; + TDB_DATA key; +}; + +static void set_priority(void) +{ + const char *ptr; + + ptr = getenv("CTDB_NOSETSCHED"); + if (ptr != NULL) { + realtime = false; + } + + if (! realtime) { + return; + } + + realtime = set_scheduler(); + if (! 
realtime) { + fprintf(stderr, + "locking: Unable to set real-time scheduler priority\n"); + } +} + +static void reset_priority(void) +{ + if (realtime) { + reset_scheduler(); + } +} + +static void send_result(int fd, char result) +{ + sys_write(fd, &result, 1); + if (result == 1) { + exit(1); + } +} + + +static void usage(const char *progname) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: %s <ctdbd-pid> <output-fd> RECORD <db-path> <db-flags> <db-key>\n", progname); + fprintf(stderr, " %s <ctdbd-pid> <output-fd> DB <db-path> <db-flags>\n", progname); +} + +static uint8_t *hex_decode_talloc(TALLOC_CTX *mem_ctx, + const char *hex_in, size_t *len) +{ + unsigned int i; + int num; + uint8_t *buffer; + + *len = strlen(hex_in) / 2; + buffer = talloc_array(mem_ctx, unsigned char, *len); + + for (i=0; i<*len; i++) { + sscanf(&hex_in[i*2], "%02X", &num); + buffer[i] = (uint8_t)num; + } + + return buffer; +} + +static int lock_record(const char *dbpath, const char *dbflags, + const char *dbkey, struct lock_state *state) +{ + int tdb_flags; + + /* No error checking since CTDB always passes sane values */ + tdb_flags = strtol(dbflags, NULL, 0); + + /* Convert hex key to key */ + if (strcmp(dbkey, "NULL") == 0) { + state->key.dptr = NULL; + state->key.dsize = 0; + } else { + state->key.dptr = hex_decode_talloc(NULL, dbkey, + &state->key.dsize); + } + + state->tdb = tdb_open(dbpath, 0, tdb_flags, O_RDWR, 0600); + if (state->tdb == NULL) { + fprintf(stderr, "locking: Error opening database %s\n", dbpath); + return 1; + } + + set_priority(); + + if (tdb_chainlock(state->tdb, state->key) < 0) { + fprintf(stderr, "locking: Error getting record lock (%s)\n", + tdb_errorstr(state->tdb)); + return 1; + } + + reset_priority(); + + return 0; + +} + +static int lock_db(const char *dbpath, const char *dbflags, + struct lock_state *state) +{ + int tdb_flags; + + /* No error checking since CTDB always passes sane values */ + tdb_flags = strtol(dbflags, NULL, 0); + + state->tdb = 
tdb_open(dbpath, 0, tdb_flags, O_RDWR, 0600); + if (state->tdb == NULL) { + fprintf(stderr, "locking: Error opening database %s\n", dbpath); + return 1; + } + + set_priority(); + + if (tdb_lockall(state->tdb) < 0) { + fprintf(stderr, "locking: Error getting db lock (%s)\n", + tdb_errorstr(state->tdb)); + return 1; + } + + reset_priority(); + + return 0; +} + +struct wait_for_parent_state { + struct tevent_context *ev; + pid_t ppid; +}; + +static void wait_for_parent_check(struct tevent_req *subreq); + +static struct tevent_req *wait_for_parent_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + pid_t ppid) +{ + struct tevent_req *req, *subreq; + struct wait_for_parent_state *state; + + req = tevent_req_create(mem_ctx, &state, struct wait_for_parent_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->ppid = ppid; + + if (ppid == 1) { + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + subreq = tevent_wakeup_send(state, ev, + tevent_timeval_current_ofs(5,0)); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, wait_for_parent_check, req); + + return req; +} + +static void wait_for_parent_check(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct wait_for_parent_state *state = tevent_req_data( + req, struct wait_for_parent_state); + bool status; + + status = tevent_wakeup_recv(subreq); + TALLOC_FREE(subreq); + if (! 
status) { + /* Ignore error */ + fprintf(stderr, "locking: tevent_wakeup_recv() failed\n"); + } + + if (kill(state->ppid, 0) == -1 && errno == ESRCH) { + tevent_req_done(req); + return; + } + + subreq = tevent_wakeup_send(state, state->ev, + tevent_timeval_current_ofs(5,0)); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, wait_for_parent_check, req); +} + +static bool wait_for_parent_recv(struct tevent_req *req, int *perr) +{ + if (tevent_req_is_unix_error(req, perr)) { + return false; + } + + return true; +} + +static void cleanup(struct lock_state *state) +{ + if (state->tdb != NULL) { + if (state->key.dsize == 0) { + tdb_unlockall(state->tdb); + } else { + tdb_chainunlock(state->tdb, state->key); + } + tdb_close(state->tdb); + } +} + +static void signal_handler(struct tevent_context *ev, + struct tevent_signal *se, + int signum, int count, void *siginfo, + void *private_data) +{ + struct lock_state *state = (struct lock_state *)private_data; + + cleanup(state); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct tevent_context *ev; + struct tevent_signal *se; + struct tevent_req *req; + struct lock_state state = { 0 }; + int write_fd; + char result = 0; + int ppid; + const char *lock_type; + bool status; + int err; + + reset_scheduler(); + + if (argc < 4) { + usage(argv[0]); + exit(1); + } + + ppid = atoi(argv[1]); + write_fd = atoi(argv[2]); + lock_type = argv[3]; + + ev = tevent_context_init(NULL); + if (ev == NULL) { + fprintf(stderr, "locking: tevent_context_init() failed\n"); + exit(1); + } + + se = tevent_add_signal(ev, ev, SIGTERM, 0, + signal_handler, &state); + if (se == NULL) { + fprintf(stderr, "locking: tevent_add_signal() failed\n"); + talloc_free(ev); + exit(1); + } + + if (strcmp(lock_type, "RECORD") == 0) { + if (argc != 7) { + fprintf(stderr, + "locking: Invalid number of arguments (%d)\n", + argc); + usage(argv[0]); + exit(1); + } + result = lock_record(argv[4], argv[5], argv[6], &state); + + } 
else if (strcmp(lock_type, "DB") == 0) { + if (argc != 6) { + fprintf(stderr, + "locking: Invalid number of arguments (%d)\n", + argc); + usage(argv[0]); + exit(1); + } + result = lock_db(argv[4], argv[5], &state); + + } else { + fprintf(stderr, "locking: Invalid lock-type '%s'\n", lock_type); + usage(argv[0]); + exit(1); + } + + send_result(write_fd, result); + + req = wait_for_parent_send(ev, ev, ppid); + if (req == NULL) { + fprintf(stderr, "locking: wait_for_parent_send() failed\n"); + cleanup(&state); + exit(1); + } + + tevent_req_poll(req, ev); + + status = wait_for_parent_recv(req, &err); + if (! status) { + fprintf(stderr, + "locking: wait_for_parent_recv() failed (%d)\n", + err); + } + + talloc_free(ev); + cleanup(&state); + return 0; +} diff --git a/ctdb/server/ctdb_logging.c b/ctdb/server/ctdb_logging.c new file mode 100644 index 0000000..1da26b5 --- /dev/null +++ b/ctdb/server/ctdb_logging.c @@ -0,0 +1,174 @@ +/* + ctdb logging code + + Copyright (C) Andrew Tridgell 2008 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/network.h" +#include "system/time.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/blocking.h" +#include "lib/util/sys_rw.h" +#include "lib/util/time.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/common.h" +#include "common/logging.h" + +struct ctdb_log_state { + int fd, pfd; + char buf[1024]; + uint16_t buf_used; +}; + +/* Used by ctdb_set_child_logging() */ +static struct ctdb_log_state *log_state; + +/* Initialise logging */ +bool ctdb_logging_init(TALLOC_CTX *mem_ctx, const char *logging, + const char *debug_level) +{ + int ret; + + log_state = talloc_zero(mem_ctx, struct ctdb_log_state); + if (log_state == NULL) { + return false; + } + + ret = logging_init(mem_ctx, logging, debug_level, "ctdbd"); + if (ret != 0) { + return false; + } + + return true; +} + +static void write_to_log(const char *buf, unsigned int len) +{ + DEBUG(script_log_level, ("%*.*s\n", len, len, buf)); +} + +/* + called when log data comes in from a child process + */ +static void ctdb_child_log_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private) +{ + struct ctdb_log_state *log = talloc_get_type(private, struct ctdb_log_state); + char *p; + int n; + + if (!(flags & TEVENT_FD_READ)) { + return; + } + + n = sys_read(log->pfd, &log->buf[log->buf_used], + sizeof(log->buf) - log->buf_used); + if (n > 0) { + log->buf_used += n; + } else if (n == 0) { + if (log != log_state) { + talloc_free(log); + } + return; + } + + while (log->buf_used > 0 && + (p = memchr(log->buf, '\n', log->buf_used)) != NULL) { + int n1 = (p - log->buf)+1; + int n2 = n1 - 1; + /* swallow \r from child processes */ + if (n2 > 0 && log->buf[n2-1] == '\r') { + n2--; + } + write_to_log(log->buf, n2); + memmove(log->buf, p+1, sizeof(log->buf) - n1); + log->buf_used -= n1; + } + + /* the buffer could 
have completely filled - unfortunately we have + no choice but to dump it out straight away */ + if (log->buf_used == sizeof(log->buf)) { + write_to_log(log->buf, log->buf_used); + log->buf_used = 0; + } +} + +/* + setup for logging of child process stdout +*/ +int ctdb_set_child_logging(struct ctdb_context *ctdb) +{ + int p[2]; + int old_stdout, old_stderr; + struct tevent_fd *fde; + + /* setup a pipe to catch IO from subprocesses */ + if (pipe(p) != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to setup for child logging pipe\n")); + return -1; + } + + /* We'll fail if stderr/stdout not already open; it's simpler. */ + old_stdout = dup(STDOUT_FILENO); + if (old_stdout < 0) { + DEBUG(DEBUG_ERR, ("Failed to dup stdout for child logging\n")); + return -1; + } + old_stderr = dup(STDERR_FILENO); + if (old_stderr < 0) { + DEBUG(DEBUG_ERR, ("Failed to dup stderr for child logging\n")); + close(old_stdout); + return -1; + } + if (dup2(p[1], STDOUT_FILENO) < 0 || dup2(p[1], STDERR_FILENO) < 0) { + int saved_errno = errno; + dup2(old_stdout, STDOUT_FILENO); + dup2(old_stderr, STDERR_FILENO); + close(old_stdout); + close(old_stderr); + close(p[0]); + close(p[1]); + errno = saved_errno; + + printf(__location__ " dup2 failed: %s\n", + strerror(errno)); + return -1; + } + close(p[1]); + close(old_stdout); + close(old_stderr); + + fde = tevent_add_fd(ctdb->ev, log_state, p[0], TEVENT_FD_READ, + ctdb_child_log_handler, log_state); + tevent_fd_set_auto_close(fde); + + log_state->pfd = p[0]; + + DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for logging\n", p[0])); + + return 0; +} diff --git a/ctdb/server/ctdb_ltdb_server.c b/ctdb/server/ctdb_ltdb_server.c new file mode 100644 index 0000000..e2cb916 --- /dev/null +++ b/ctdb/server/ctdb_ltdb_server.c @@ -0,0 +1,1663 @@ +/* + ctdb ltdb code - server side + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as 
published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/dir.h" +#include "system/time.h" +#include "system/locale.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/rb_tree.h" +#include "common/reqid.h" +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" + +#include "server/ctdb_config.h" + +#define PERSISTENT_HEALTH_TDB "persistent_health.tdb" + +/** + * write a record to a normal database + * + * This is the server-variant of the ctdb_ltdb_store function. + * It contains logic to determine whether a record should be + * stored or deleted. It also sends SCHEDULE_FOR_DELETION + * controls to the local ctdb daemon if appropriate. 
+ */ +static int ctdb_ltdb_store_server(struct ctdb_db_context *ctdb_db, + TDB_DATA key, + struct ctdb_ltdb_header *header, + TDB_DATA data) +{ + struct ctdb_context *ctdb = ctdb_db->ctdb; + TDB_DATA rec[2]; + uint32_t hsize = sizeof(struct ctdb_ltdb_header); + int ret; + bool keep = false; + bool schedule_for_deletion = false; + bool remove_from_delete_queue = false; + uint32_t lmaster; + + if (ctdb->flags & CTDB_FLAG_TORTURE) { + TDB_DATA old; + struct ctdb_ltdb_header *h2; + + old = tdb_fetch(ctdb_db->ltdb->tdb, key); + h2 = (struct ctdb_ltdb_header *)old.dptr; + if (old.dptr != NULL && + old.dsize >= hsize && + h2->rsn > header->rsn) { + DEBUG(DEBUG_ERR, + ("RSN regression! %"PRIu64" %"PRIu64"\n", + h2->rsn, header->rsn)); + } + if (old.dptr) { + free(old.dptr); + } + } + + if (ctdb->vnn_map == NULL) { + /* + * Called from a client: always store the record + * Also don't call ctdb_lmaster since it uses the vnn_map! + */ + keep = true; + goto store; + } + + lmaster = ctdb_lmaster(ctdb_db->ctdb, &key); + + /* + * If we migrate an empty record off to another node + * and the record has not been migrated with data, + * delete the record instead of storing the empty record. + */ + if (data.dsize != 0) { + keep = true; + } else if (header->flags & CTDB_REC_RO_FLAGS) { + keep = true; + } else if (header->flags & CTDB_REC_FLAG_AUTOMATIC) { + /* + * The record is not created by the client but + * automatically by the ctdb_ltdb_fetch logic that + * creates a record with an initial header in the + * ltdb before trying to migrate the record from + * the current lmaster. Keep it instead of trying + * to delete the non-existing record... + */ + keep = true; + schedule_for_deletion = true; + } else if (header->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) { + keep = true; + } else if (ctdb_db->ctdb->pnn == lmaster) { + /* + * If we are lmaster, then we usually keep the record. 
+ * But if we retrieve the dmaster role by a VACUUM_MIGRATE + * and the record is empty and has never been migrated + * with data, then we should delete it instead of storing it. + * This is part of the vacuuming process. + * + * The reason that we usually need to store even empty records + * on the lmaster is that a client operating directly on the + * lmaster (== dmaster) expects the local copy of the record to + * exist after successful ctdb migrate call. If the record does + * not exist, the client goes into a migrate loop and eventually + * fails. So storing the empty record makes sure that we do not + * need to change the client code. + */ + if (!(header->flags & CTDB_REC_FLAG_VACUUM_MIGRATED)) { + keep = true; + } else if (ctdb_db->ctdb->pnn != header->dmaster) { + keep = true; + } + } else if (ctdb_db->ctdb->pnn == header->dmaster) { + keep = true; + } + + if (keep) { + if (ctdb_db_volatile(ctdb_db) && + (ctdb_db->ctdb->pnn == header->dmaster) && + !(header->flags & CTDB_REC_RO_FLAGS)) + { + header->rsn++; + + if (data.dsize == 0) { + schedule_for_deletion = true; + } + } + remove_from_delete_queue = !schedule_for_deletion; + } + +store: + /* + * The VACUUM_MIGRATED flag is only set temporarily for + * the above logic when the record was retrieved by a + * VACUUM_MIGRATE call and should not be stored in the + * database. + * + * The VACUUM_MIGRATE call is triggered by a vacuum fetch, + * and there are two cases in which the corresponding record + * is stored in the local database: + * 1. The record has been migrated with data in the past + * (the MIGRATED_WITH_DATA record flag is set). + * 2. The record has been filled with data again since it + * had been submitted in the VACUUM_FETCH message to the + * lmaster. + * For such records it is important to not store the + * VACUUM_MIGRATED flag in the database. 
+ */ + header->flags &= ~CTDB_REC_FLAG_VACUUM_MIGRATED; + + /* + * Similarly, clear the AUTOMATIC flag which should not enter + * the local database copy since this would require client + * modifications to clear the flag when the client stores + * the record. + */ + header->flags &= ~CTDB_REC_FLAG_AUTOMATIC; + + rec[0].dsize = hsize; + rec[0].dptr = (uint8_t *)header; + + rec[1].dsize = data.dsize; + rec[1].dptr = data.dptr; + + DEBUG(DEBUG_DEBUG, (__location__ " db[%s]: %s record: hash[0x%08x]\n", + ctdb_db->db_name, + keep?"storing":"deleting", + ctdb_hash(&key))); + + if (keep) { + ret = tdb_storev(ctdb_db->ltdb->tdb, key, rec, 2, TDB_REPLACE); + } else { + ret = tdb_delete(ctdb_db->ltdb->tdb, key); + } + + if (ret != 0) { + int lvl = DEBUG_ERR; + + if (keep == false && + tdb_error(ctdb_db->ltdb->tdb) == TDB_ERR_NOEXIST) + { + lvl = DEBUG_DEBUG; + } + + DEBUG(lvl, (__location__ " db[%s]: Failed to %s record: " + "%d - %s\n", + ctdb_db->db_name, + keep?"store":"delete", ret, + tdb_errorstr(ctdb_db->ltdb->tdb))); + + schedule_for_deletion = false; + remove_from_delete_queue = false; + } + + if (schedule_for_deletion) { + int ret2; + ret2 = ctdb_local_schedule_for_deletion(ctdb_db, header, key); + if (ret2 != 0) { + DEBUG(DEBUG_ERR, (__location__ " ctdb_local_schedule_for_deletion failed.\n")); + } + } + + if (remove_from_delete_queue) { + ctdb_local_remove_from_delete_queue(ctdb_db, header, key); + } + + return ret; +} + +struct lock_fetch_state { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + void (*recv_pkt)(void *, struct ctdb_req_header *); + void *recv_context; + struct ctdb_req_header *hdr; + uint32_t generation; + bool ignore_generation; +}; + +/* + called when we should retry the operation + */ +static void lock_fetch_callback(void *p, bool locked) +{ + struct lock_fetch_state *state = talloc_get_type(p, struct lock_fetch_state); + if (!state->ignore_generation && + state->generation != state->ctdb_db->generation) { + 
DEBUG(DEBUG_NOTICE,("Discarding previous generation lockwait packet\n")); + talloc_free(state->hdr); + return; + } + state->recv_pkt(state->recv_context, state->hdr); + DEBUG(DEBUG_INFO,(__location__ " PACKET REQUEUED\n")); +} + + +/* + do a non-blocking ltdb_lock, deferring this ctdb request until we + have the chainlock + + It does the following: + + 1) tries to get the chainlock. If it succeeds, then it returns 0 + + 2) if it fails to get a chainlock immediately then it sets up a + non-blocking chainlock via ctdb_lock_record, and when it gets the + chainlock it re-submits this ctdb request to the main packet + receive function. + + This effectively queues all ctdb requests that cannot be + immediately satisfied until it can get the lock. This means that + the main ctdb daemon will not block waiting for a chainlock held by + a client + + There are 3 possible return values: + + 0: means that it got the lock immediately. + -1: means that it failed to get the lock, and won't retry + -2: means that it failed to get the lock immediately, but will retry + */ +int ctdb_ltdb_lock_requeue(struct ctdb_db_context *ctdb_db, + TDB_DATA key, struct ctdb_req_header *hdr, + void (*recv_pkt)(void *, struct ctdb_req_header *), + void *recv_context, bool ignore_generation) +{ + int ret; + struct tdb_context *tdb = ctdb_db->ltdb->tdb; + struct lock_request *lreq; + struct lock_fetch_state *state; + + ret = tdb_chainlock_nonblock(tdb, key); + + if (ret != 0 && + !(errno == EACCES || errno == EAGAIN || errno == EDEADLK)) { + /* a hard failure - don't try again */ + return -1; + } + + /* when torturing, ensure we test the contended path */ + if ((ctdb_db->ctdb->flags & CTDB_FLAG_TORTURE) && + random() % 5 == 0) { + ret = -1; + tdb_chainunlock(tdb, key); + } + + /* first the non-contended path */ + if (ret == 0) { + return 0; + } + + state = talloc(hdr, struct lock_fetch_state); + state->ctdb = ctdb_db->ctdb; + state->ctdb_db = ctdb_db; + state->hdr = hdr; + state->recv_pkt = recv_pkt; 
+ state->recv_context = recv_context; + state->generation = ctdb_db->generation; + state->ignore_generation = ignore_generation; + + /* now the contended path */ + lreq = ctdb_lock_record(state, ctdb_db, key, true, lock_fetch_callback, state); + if (lreq == NULL) { + return -1; + } + + /* we need to move the packet off the temporary context in ctdb_input_pkt(), + so it won't be freed yet */ + talloc_steal(state, hdr); + + /* now tell the caller than we will retry asynchronously */ + return -2; +} + +/* + a variant of ctdb_ltdb_lock_requeue that also fetches the record + */ +int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db, + TDB_DATA key, struct ctdb_ltdb_header *header, + struct ctdb_req_header *hdr, TDB_DATA *data, + void (*recv_pkt)(void *, struct ctdb_req_header *), + void *recv_context, bool ignore_generation) +{ + int ret; + + ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr, recv_pkt, + recv_context, ignore_generation); + if (ret != 0) { + return ret; + } + + ret = ctdb_ltdb_fetch(ctdb_db, key, header, hdr, data); + if (ret != 0) { + int uret; + uret = ctdb_ltdb_unlock(ctdb_db, key); + if (uret != 0) { + DBG_ERR("ctdb_ltdb_unlock() failed with error %d\n", + uret); + } + } + return ret; +} + + +/* + paranoid check to see if the db is empty + */ +static void ctdb_check_db_empty(struct ctdb_db_context *ctdb_db) +{ + struct tdb_context *tdb = ctdb_db->ltdb->tdb; + int count = tdb_traverse_read(tdb, NULL, NULL); + if (count != 0) { + DEBUG(DEBUG_ALERT,(__location__ " tdb '%s' not empty on attach! 
aborting\n", + ctdb_db->db_path)); + ctdb_fatal(ctdb_db->ctdb, "database not empty on attach"); + } +} + +int ctdb_load_persistent_health(struct ctdb_context *ctdb, + struct ctdb_db_context *ctdb_db) +{ + struct tdb_context *tdb = ctdb->db_persistent_health->tdb; + char *old; + char *reason = NULL; + TDB_DATA key; + TDB_DATA val; + + key.dptr = discard_const_p(uint8_t, ctdb_db->db_name); + key.dsize = strlen(ctdb_db->db_name); + + old = ctdb_db->unhealthy_reason; + ctdb_db->unhealthy_reason = NULL; + + val = tdb_fetch(tdb, key); + if (val.dsize > 0) { + reason = talloc_strndup(ctdb_db, + (const char *)val.dptr, + val.dsize); + if (reason == NULL) { + DEBUG(DEBUG_ALERT,(__location__ " talloc_strndup(%d) failed\n", + (int)val.dsize)); + ctdb_db->unhealthy_reason = old; + free(val.dptr); + return -1; + } + } + + if (val.dptr) { + free(val.dptr); + } + + talloc_free(old); + ctdb_db->unhealthy_reason = reason; + return 0; +} + +int ctdb_update_persistent_health(struct ctdb_context *ctdb, + struct ctdb_db_context *ctdb_db, + const char *given_reason,/* NULL means healthy */ + unsigned int num_healthy_nodes) +{ + struct tdb_context *tdb = ctdb->db_persistent_health->tdb; + int ret; + TDB_DATA key; + TDB_DATA val; + char *new_reason = NULL; + char *old_reason = NULL; + + ret = tdb_transaction_start(tdb); + if (ret != 0) { + DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_start('%s') failed: %d - %s\n", + tdb_name(tdb), ret, tdb_errorstr(tdb))); + return -1; + } + + ret = ctdb_load_persistent_health(ctdb, ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n", + ctdb_db->db_name, ret)); + return -1; + } + old_reason = ctdb_db->unhealthy_reason; + + key.dptr = discard_const_p(uint8_t, ctdb_db->db_name); + key.dsize = strlen(ctdb_db->db_name); + + if (given_reason) { + new_reason = talloc_strdup(ctdb_db, given_reason); + if (new_reason == NULL) { + DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup(%s) failed\n", + 
given_reason)); + return -1; + } + } else if (old_reason && num_healthy_nodes == 0) { + /* + * If the reason indicates ok, but there were no healthy nodes + * available, it means that we have not recovered valid content + * of the db. So if there's an old reason, prefix it with + * "NO-HEALTHY-NODES - " + */ + const char *prefix; + +#define _TMP_PREFIX "NO-HEALTHY-NODES - " + ret = strncmp(_TMP_PREFIX, old_reason, strlen(_TMP_PREFIX)); + if (ret != 0) { + prefix = _TMP_PREFIX; + } else { + prefix = ""; + } + new_reason = talloc_asprintf(ctdb_db, "%s%s", + prefix, old_reason); + if (new_reason == NULL) { + DEBUG(DEBUG_ALERT,(__location__ " talloc_asprintf(%s%s) failed\n", + prefix, old_reason)); + return -1; + } +#undef _TMP_PREFIX + } + + if (new_reason) { + val.dptr = discard_const_p(uint8_t, new_reason); + val.dsize = strlen(new_reason); + + ret = tdb_store(tdb, key, val, TDB_REPLACE); + if (ret != 0) { + tdb_transaction_cancel(tdb); + DEBUG(DEBUG_ALERT,(__location__ " tdb_store('%s', %s, %s) failed: %d - %s\n", + tdb_name(tdb), ctdb_db->db_name, new_reason, + ret, tdb_errorstr(tdb))); + talloc_free(new_reason); + return -1; + } + DEBUG(DEBUG_ALERT,("Updated db health for db(%s) to: %s\n", + ctdb_db->db_name, new_reason)); + } else if (old_reason) { + ret = tdb_delete(tdb, key); + if (ret != 0) { + tdb_transaction_cancel(tdb); + DEBUG(DEBUG_ALERT,(__location__ " tdb_delete('%s', %s) failed: %d - %s\n", + tdb_name(tdb), ctdb_db->db_name, + ret, tdb_errorstr(tdb))); + talloc_free(new_reason); + return -1; + } + DEBUG(DEBUG_NOTICE,("Updated db health for db(%s): OK\n", + ctdb_db->db_name)); + } + + ret = tdb_transaction_commit(tdb); + if (ret != TDB_SUCCESS) { + DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_commit('%s') failed: %d - %s\n", + tdb_name(tdb), ret, tdb_errorstr(tdb))); + talloc_free(new_reason); + return -1; + } + + talloc_free(old_reason); + ctdb_db->unhealthy_reason = new_reason; + + return 0; +} + +static int ctdb_backup_corrupted_tdb(struct 
ctdb_context *ctdb, + struct ctdb_db_context *ctdb_db) +{ + time_t now = time(NULL); + char *new_path; + char *new_reason; + int ret; + struct tm *tm; + + tm = gmtime(&now); + + /* formatted like: foo.tdb.0.corrupted.20091204160825.0Z */ + new_path = talloc_asprintf(ctdb_db, "%s.corrupted." + "%04u%02u%02u%02u%02u%02u.0Z", + ctdb_db->db_path, + tm->tm_year+1900, tm->tm_mon+1, + tm->tm_mday, tm->tm_hour, tm->tm_min, + tm->tm_sec); + if (new_path == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n")); + return -1; + } + + new_reason = talloc_asprintf(ctdb_db, + "ERROR - Backup of corrupted TDB in '%s'", + new_path); + if (new_reason == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n")); + return -1; + } + ret = ctdb_update_persistent_health(ctdb, ctdb_db, new_reason, 0); + talloc_free(new_reason); + if (ret != 0) { + DEBUG(DEBUG_CRIT,(__location__ + ": ctdb_backup_corrupted_tdb(%s) not implemented yet\n", + ctdb_db->db_path)); + return -1; + } + + ret = rename(ctdb_db->db_path, new_path); + if (ret != 0) { + DEBUG(DEBUG_CRIT,(__location__ + ": ctdb_backup_corrupted_tdb(%s) rename to %s failed: %d - %s\n", + ctdb_db->db_path, new_path, + errno, strerror(errno))); + talloc_free(new_path); + return -1; + } + + DEBUG(DEBUG_CRIT,(__location__ + ": ctdb_backup_corrupted_tdb(%s) renamed to %s\n", + ctdb_db->db_path, new_path)); + talloc_free(new_path); + return 0; +} + +int ctdb_recheck_persistent_health(struct ctdb_context *ctdb) +{ + struct ctdb_db_context *ctdb_db; + int ret; + int ok = 0; + int fail = 0; + + for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) { + if (!ctdb_db_persistent(ctdb_db)) { + continue; + } + + ret = ctdb_load_persistent_health(ctdb, ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_ALERT,(__location__ + " load persistent health for '%s' failed\n", + ctdb_db->db_path)); + return -1; + } + + if (ctdb_db->unhealthy_reason == NULL) { + ok++; + DEBUG(DEBUG_INFO,(__location__ + " persistent db '%s' 
healthy\n", + ctdb_db->db_path)); + continue; + } + + fail++; + DEBUG(DEBUG_ALERT,(__location__ + " persistent db '%s' unhealthy: %s\n", + ctdb_db->db_path, + ctdb_db->unhealthy_reason)); + } + DEBUG(DEBUG_NOTICE, + ("ctdb_recheck_persistent_health: OK[%d] FAIL[%d]\n", + ok, fail)); + + if (fail != 0) { + return -1; + } + + return 0; +} + + +/* + mark a database - as healthy + */ +int32_t ctdb_control_db_set_healthy(struct ctdb_context *ctdb, TDB_DATA indata) +{ + uint32_t db_id = *(uint32_t *)indata.dptr; + struct ctdb_db_context *ctdb_db; + int ret; + bool may_recover = false; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id)); + return -1; + } + + if (ctdb_db->unhealthy_reason) { + may_recover = true; + } + + ret = ctdb_update_persistent_health(ctdb, ctdb_db, NULL, 1); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ + " ctdb_update_persistent_health(%s) failed\n", + ctdb_db->db_name)); + return -1; + } + + if (may_recover && ctdb->runstate == CTDB_RUNSTATE_STARTUP) { + DEBUG(DEBUG_ERR, (__location__ " db %s become healthy - force recovery for startup\n", + ctdb_db->db_name)); + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + } + + return 0; +} + +int32_t ctdb_control_db_get_health(struct ctdb_context *ctdb, + TDB_DATA indata, + TDB_DATA *outdata) +{ + uint32_t db_id = *(uint32_t *)indata.dptr; + struct ctdb_db_context *ctdb_db; + int ret; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id)); + return -1; + } + + ret = ctdb_load_persistent_health(ctdb, ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ + " ctdb_load_persistent_health(%s) failed\n", + ctdb_db->db_name)); + return -1; + } + + *outdata = tdb_null; + if (ctdb_db->unhealthy_reason) { + outdata->dptr = (uint8_t *)ctdb_db->unhealthy_reason; + outdata->dsize = strlen(ctdb_db->unhealthy_reason)+1; + } + + return 0; +} + + +int ctdb_set_db_readonly(struct 
ctdb_context *ctdb, struct ctdb_db_context *ctdb_db) +{ + char *ropath; + + if (ctdb_db_readonly(ctdb_db)) { + return 0; + } + + if (! ctdb_db_volatile(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("Non-volatile databases do not support readonly flag\n")); + return -1; + } + + ropath = talloc_asprintf(ctdb_db, "%s.RO", ctdb_db->db_path); + if (ropath == NULL) { + DEBUG(DEBUG_CRIT,("Failed to asprintf the tracking database\n")); + return -1; + } + ctdb_db->rottdb = tdb_open(ropath, + ctdb->tunable.database_hash_size, + TDB_NOLOCK|TDB_CLEAR_IF_FIRST|TDB_NOSYNC, + O_CREAT|O_RDWR, 0600); + if (ctdb_db->rottdb == NULL) { + DEBUG(DEBUG_CRIT,("Failed to open/create the tracking database '%s'\n", ropath)); + talloc_free(ropath); + return -1; + } + + DEBUG(DEBUG_NOTICE,("OPENED tracking database : '%s'\n", ropath)); + + ctdb_db_set_readonly(ctdb_db); + + DEBUG(DEBUG_NOTICE, ("Readonly property set on DB %s\n", ctdb_db->db_name)); + + talloc_free(ropath); + return 0; +} + +/* + attach to a database, handling both persistent and non-persistent databases + return 0 on success, -1 on failure + */ +static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name, + uint8_t db_flags, const char *unhealthy_reason) +{ + struct ctdb_db_context *ctdb_db, *tmp_db; + int ret; + struct TDB_DATA key; + int tdb_flags; + int mode = 0600; + int remaining_tries = 0; + + ctdb_db = talloc_zero(ctdb, struct ctdb_db_context); + CTDB_NO_MEMORY(ctdb, ctdb_db); + + ctdb_db->ctdb = ctdb; + ctdb_db->db_name = talloc_strdup(ctdb_db, db_name); + CTDB_NO_MEMORY(ctdb, ctdb_db->db_name); + + key.dsize = strlen(db_name)+1; + key.dptr = discard_const(db_name); + ctdb_db->db_id = ctdb_hash(&key); + ctdb_db->db_flags = db_flags; + + if (ctdb_db_volatile(ctdb_db)) { + ctdb_db->delete_queue = trbt_create(ctdb_db, 0); + if (ctdb_db->delete_queue == NULL) { + CTDB_NO_MEMORY(ctdb, ctdb_db->delete_queue); + } + + ctdb_db->fetch_queue = trbt_create(ctdb_db, 0); + if (ctdb_db->fetch_queue == NULL) { + 
CTDB_NO_MEMORY(ctdb, ctdb_db->fetch_queue); + } + + ctdb_db->ctdb_ltdb_store_fn = ctdb_ltdb_store_server; + } + + /* check for hash collisions */ + for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) { + if (tmp_db->db_id == ctdb_db->db_id) { + DEBUG(DEBUG_CRIT,("db_id 0x%x hash collision. name1='%s' name2='%s'\n", + tmp_db->db_id, db_name, tmp_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + } + + if (ctdb_db_persistent(ctdb_db)) { + if (unhealthy_reason) { + ret = ctdb_update_persistent_health(ctdb, ctdb_db, + unhealthy_reason, 0); + if (ret != 0) { + DEBUG(DEBUG_ALERT,(__location__ " ctdb_update_persistent_health('%s','%s') failed: %d\n", + ctdb_db->db_name, unhealthy_reason, ret)); + talloc_free(ctdb_db); + return -1; + } + } + + if (ctdb->max_persistent_check_errors > 0) { + remaining_tries = 1; + } + if (ctdb->runstate == CTDB_RUNSTATE_RUNNING) { + remaining_tries = 0; + } + + ret = ctdb_load_persistent_health(ctdb, ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n", + ctdb_db->db_name, ret)); + talloc_free(ctdb_db); + return -1; + } + } + + if (ctdb_db->unhealthy_reason && remaining_tries == 0) { + DEBUG(DEBUG_ALERT,(__location__ "ERROR: tdb %s is marked as unhealthy: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + talloc_free(ctdb_db); + return -1; + } + + if (ctdb_db->unhealthy_reason) { + /* this is just a warning, but we want that in the log file! */ + DEBUG(DEBUG_ALERT,(__location__ "Warning: tdb %s is marked as unhealthy: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + } + + /* open the database */ + ctdb_db->db_path = talloc_asprintf(ctdb_db, "%s/%s.%u", + ctdb_db_persistent(ctdb_db) ? 
+ ctdb->db_directory_persistent : + ctdb->db_directory, + db_name, ctdb->pnn); + + tdb_flags = ctdb_db_tdb_flags(db_flags, + ctdb->valgrinding, + ctdb_config.tdb_mutexes); + +again: + ctdb_db->ltdb = tdb_wrap_open(ctdb_db, ctdb_db->db_path, + ctdb->tunable.database_hash_size, + tdb_flags, + O_CREAT|O_RDWR, mode); + if (ctdb_db->ltdb == NULL) { + struct stat st; + int saved_errno = errno; + + if (! ctdb_db_persistent(ctdb_db)) { + DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n", + ctdb_db->db_path, + saved_errno, + strerror(saved_errno))); + talloc_free(ctdb_db); + return -1; + } + + if (remaining_tries == 0) { + DEBUG(DEBUG_CRIT,(__location__ + "Failed to open persistent tdb '%s': %d - %s\n", + ctdb_db->db_path, + saved_errno, + strerror(saved_errno))); + talloc_free(ctdb_db); + return -1; + } + + ret = stat(ctdb_db->db_path, &st); + if (ret != 0) { + DEBUG(DEBUG_CRIT,(__location__ + "Failed to open persistent tdb '%s': %d - %s\n", + ctdb_db->db_path, + saved_errno, + strerror(saved_errno))); + talloc_free(ctdb_db); + return -1; + } + + ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_CRIT,(__location__ + "Failed to open persistent tdb '%s': %d - %s\n", + ctdb_db->db_path, + saved_errno, + strerror(saved_errno))); + talloc_free(ctdb_db); + return -1; + } + + remaining_tries--; + mode = st.st_mode; + goto again; + } + + if (!ctdb_db_persistent(ctdb_db)) { + ctdb_check_db_empty(ctdb_db); + } else { + ret = tdb_check(ctdb_db->ltdb->tdb, NULL, NULL); + if (ret != 0) { + int fd; + struct stat st; + + DEBUG(DEBUG_CRIT,("tdb_check(%s) failed: %d - %s\n", + ctdb_db->db_path, ret, + tdb_errorstr(ctdb_db->ltdb->tdb))); + if (remaining_tries == 0) { + talloc_free(ctdb_db); + return -1; + } + + fd = tdb_fd(ctdb_db->ltdb->tdb); + ret = fstat(fd, &st); + if (ret != 0) { + DEBUG(DEBUG_CRIT,(__location__ + "Failed to fstat() persistent tdb '%s': %d - %s\n", + ctdb_db->db_path, + errno, + strerror(errno))); + talloc_free(ctdb_db); + return -1; 
+ } + + /* close the TDB */ + talloc_free(ctdb_db->ltdb); + ctdb_db->ltdb = NULL; + + ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_CRIT,("Failed to backup corrupted tdb '%s'\n", + ctdb_db->db_path)); + talloc_free(ctdb_db); + return -1; + } + + remaining_tries--; + mode = st.st_mode; + goto again; + } + } + + /* remember the flags the client has specified */ + tdb_add_flags(ctdb_db->ltdb->tdb, tdb_flags); + + + /* set up a rb tree we can use to track which records we have a + fetch-lock in-flight for so we can defer any additional calls + for the same record. + */ + ctdb_db->deferred_fetch = trbt_create(ctdb_db, 0); + if (ctdb_db->deferred_fetch == NULL) { + DEBUG(DEBUG_ERR,("Failed to create deferred fetch rb tree for ctdb database\n")); + talloc_free(ctdb_db); + return -1; + } + + ctdb_db->defer_dmaster = trbt_create(ctdb_db, 0); + if (ctdb_db->defer_dmaster == NULL) { + DEBUG(DEBUG_ERR, ("Failed to create defer dmaster rb tree for %s\n", + ctdb_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + + DLIST_ADD(ctdb->db_list, ctdb_db); + + /* setting this can help some high churn databases */ + tdb_set_max_dead(ctdb_db->ltdb->tdb, ctdb->tunable.database_max_dead); + + /* + all databases support the "null" function. we need this in + order to do forced migration of records + */ + ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_null_func, CTDB_NULL_FUNC); + if (ret != 0) { + DEBUG(DEBUG_CRIT,("Failed to setup null function for '%s'\n", ctdb_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + + /* + all databases support the "fetch" function. we need this + for efficient Samba3 ctdb fetch + */ + ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_func, CTDB_FETCH_FUNC); + if (ret != 0) { + DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + + /* + all databases support the "fetch_with_header" function. 
we need this + for efficient readonly record fetches + */ + ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC); + if (ret != 0) { + DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + + ret = ctdb_vacuum_init(ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_CRIT,("Failed to setup vacuuming for " + "database '%s'\n", ctdb_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + + ret = ctdb_migration_init(ctdb_db); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed to setup migration tracking for db '%s'\n", + ctdb_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + + ret = db_hash_init(ctdb_db, "lock_log", 2048, DB_HASH_COMPLEX, + &ctdb_db->lock_log); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed to setup lock logging for db '%s'\n", + ctdb_db->db_name)); + talloc_free(ctdb_db); + return -1; + } + + ctdb_db->generation = ctdb->vnn_map->generation; + + DEBUG(DEBUG_NOTICE,("Attached to database '%s' with flags 0x%x\n", + ctdb_db->db_path, tdb_flags)); + + /* success */ + return 0; +} + + +struct ctdb_deferred_attach_context { + struct ctdb_deferred_attach_context *next, *prev; + struct ctdb_context *ctdb; + struct ctdb_req_control_old *c; +}; + + +static int ctdb_deferred_attach_destructor(struct ctdb_deferred_attach_context *da_ctx) +{ + DLIST_REMOVE(da_ctx->ctdb->deferred_attach, da_ctx); + + return 0; +} + +static void ctdb_deferred_attach_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_deferred_attach_context *da_ctx = talloc_get_type(private_data, struct ctdb_deferred_attach_context); + struct ctdb_context *ctdb = da_ctx->ctdb; + + ctdb_request_control_reply(ctdb, da_ctx->c, NULL, -1, NULL); + talloc_free(da_ctx); +} + +static void ctdb_deferred_attach_callback(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct 
ctdb_deferred_attach_context *da_ctx = talloc_get_type(private_data, struct ctdb_deferred_attach_context); + struct ctdb_context *ctdb = da_ctx->ctdb; + + /* This talloc-steals the packet ->c */ + ctdb_input_pkt(ctdb, (struct ctdb_req_header *)da_ctx->c); + talloc_free(da_ctx); +} + +int ctdb_process_deferred_attach(struct ctdb_context *ctdb) +{ + struct ctdb_deferred_attach_context *da_ctx; + + /* call it from the main event loop as soon as the current event + finishes. + */ + while ((da_ctx = ctdb->deferred_attach) != NULL) { + DLIST_REMOVE(ctdb->deferred_attach, da_ctx); + tevent_add_timer(ctdb->ev, da_ctx, + timeval_current_ofs(1,0), + ctdb_deferred_attach_callback, da_ctx); + } + + return 0; +} + +/* + a client has asked to attach a new database + */ +int32_t ctdb_control_db_attach(struct ctdb_context *ctdb, + TDB_DATA indata, + TDB_DATA *outdata, + uint8_t db_flags, + uint32_t srcnode, + uint32_t client_id, + struct ctdb_req_control_old *c, + bool *async_reply) +{ + const char *db_name = (const char *)indata.dptr; + struct ctdb_db_context *db; + struct ctdb_node *node = ctdb->nodes[ctdb->pnn]; + struct ctdb_client *client = NULL; + uint32_t opcode; + + if (ctdb->tunable.allow_client_db_attach == 0) { + DEBUG(DEBUG_ERR, ("DB Attach to database %s denied by tunable " + "AllowClientDBAccess == 0\n", db_name)); + return -1; + } + + /* don't allow any local clients to attach while we are in recovery mode + * except for the recovery daemon. + * allow all attach from the network since these are always from remote + * recovery daemons. 
+ */ + if (srcnode == ctdb->pnn && client_id != 0) { + client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + } + if (client != NULL) { + /* If the node is inactive it is not part of the cluster + and we should not allow clients to attach to any + databases + */ + if (node->flags & NODE_FLAGS_INACTIVE) { + DEBUG(DEBUG_ERR,("DB Attach to database %s refused since node is inactive (flags=0x%x)\n", db_name, node->flags)); + return -1; + } + + if ((c->flags & CTDB_CTRL_FLAG_ATTACH_RECOVERY) && + ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { + DBG_ERR("Attach from recovery refused because " + "recovery is not active\n"); + return -1; + } + + if (!(c->flags & CTDB_CTRL_FLAG_ATTACH_RECOVERY) && + (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE || + ctdb->runstate < CTDB_RUNSTATE_STARTUP)) { + struct ctdb_deferred_attach_context *da_ctx = talloc(client, struct ctdb_deferred_attach_context); + + if (da_ctx == NULL) { + DEBUG(DEBUG_ERR,("DB Attach to database %s deferral for client with pid:%d failed due to OOM.\n", db_name, client->pid)); + return -1; + } + + da_ctx->ctdb = ctdb; + da_ctx->c = talloc_steal(da_ctx, c); + talloc_set_destructor(da_ctx, ctdb_deferred_attach_destructor); + DLIST_ADD(ctdb->deferred_attach, da_ctx); + + tevent_add_timer(ctdb->ev, da_ctx, + timeval_current_ofs(ctdb->tunable.deferred_attach_timeout, 0), + ctdb_deferred_attach_timeout, da_ctx); + + DEBUG(DEBUG_ERR,("DB Attach to database %s deferred for client with pid:%d since node is in recovery mode.\n", db_name, client->pid)); + *async_reply = true; + return 0; + } + } + + /* see if we already have this name */ + db = ctdb_db_handle(ctdb, db_name); + if (db) { + if ((db->db_flags & db_flags) != db_flags) { + DEBUG(DEBUG_ERR, + ("Error: Failed to re-attach with 0x%x flags," + " database has 0x%x flags\n", db_flags, + db->db_flags)); + return -1; + } + outdata->dptr = (uint8_t *)&db->db_id; + outdata->dsize = sizeof(db->db_id); + return 0; + } + + if (ctdb_local_attach(ctdb, db_name, 
db_flags, NULL) != 0) { + return -1; + } + + db = ctdb_db_handle(ctdb, db_name); + if (!db) { + DEBUG(DEBUG_ERR,("Failed to find db handle for name '%s'\n", db_name)); + return -1; + } + + outdata->dptr = (uint8_t *)&db->db_id; + outdata->dsize = sizeof(db->db_id); + + /* Try to ensure it's locked in mem */ + lockdown_memory(ctdb->valgrinding); + + if (ctdb_db_persistent(db)) { + opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT; + } else if (ctdb_db_replicated(db)) { + opcode = CTDB_CONTROL_DB_ATTACH_REPLICATED; + } else { + opcode = CTDB_CONTROL_DB_ATTACH; + } + + /* tell all the other nodes about this database */ + ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, opcode, + 0, CTDB_CTRL_FLAG_NOREPLY, + indata, NULL, NULL); + + /* success */ + return 0; +} + +/* + * a client has asked to detach from a database + */ +int32_t ctdb_control_db_detach(struct ctdb_context *ctdb, TDB_DATA indata, + uint32_t client_id) +{ + uint32_t db_id; + struct ctdb_db_context *ctdb_db; + struct ctdb_client *client = NULL; + + db_id = *(uint32_t *)indata.dptr; + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR, ("Invalid dbid 0x%08x in DB detach\n", + db_id)); + return -1; + } + + if (ctdb->tunable.allow_client_db_attach == 1) { + DEBUG(DEBUG_ERR, ("DB detach from database %s denied. " + "Clients are allowed access to databases " + "(AllowClientDBAccess == 1)\n", + ctdb_db->db_name)); + return -1; + } + + if (! ctdb_db_volatile(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("Detaching non-volatile database %s denied\n", + ctdb_db->db_name)); + return -1; + } + + /* Cannot detach from database when in recovery */ + if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) { + DEBUG(DEBUG_ERR, ("DB detach denied while in recovery\n")); + return -1; + } + + /* If a control comes from a client, then broadcast it to all nodes. + * Do the actual detach only if the control comes from other daemons. 
+ */ + if (client_id != 0) { + client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + if (client != NULL) { + /* forward the control to all the nodes */ + ctdb_daemon_send_control(ctdb, + CTDB_BROADCAST_CONNECTED, 0, + CTDB_CONTROL_DB_DETACH, 0, + CTDB_CTRL_FLAG_NOREPLY, + indata, NULL, NULL); + return 0; + } + DEBUG(DEBUG_ERR, ("Client has gone away. Failing DB detach " + "for database '%s'\n", ctdb_db->db_name)); + return -1; + } + + /* Disable vacuuming and drop all vacuuming data */ + talloc_free(ctdb_db->vacuum_handle); + talloc_free(ctdb_db->delete_queue); + talloc_free(ctdb_db->fetch_queue); + + /* Terminate any deferred fetch */ + talloc_free(ctdb_db->deferred_fetch); + + /* Terminate any traverses */ + while (ctdb_db->traverse) { + talloc_free(ctdb_db->traverse); + } + + /* Terminate any revokes */ + while (ctdb_db->revokechild_active) { + talloc_free(ctdb_db->revokechild_active); + } + + /* Free readonly tracking database */ + if (ctdb_db_readonly(ctdb_db)) { + talloc_free(ctdb_db->rottdb); + } + + DLIST_REMOVE(ctdb->db_list, ctdb_db); + + DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n", + ctdb_db->db_name)); + talloc_free(ctdb_db); + + return 0; +} + +/* + attach to all existing persistent databases + */ +static int ctdb_attach_persistent(struct ctdb_context *ctdb, + const char *unhealthy_reason) +{ + DIR *d; + struct dirent *de; + + /* open the persistent db directory and scan it for files */ + d = opendir(ctdb->db_directory_persistent); + if (d == NULL) { + return 0; + } + + while ((de=readdir(d))) { + char *p, *s, *q; + size_t len = strlen(de->d_name); + uint32_t node; + int invalid_name = 0; + + s = talloc_strdup(ctdb, de->d_name); + if (s == NULL) { + closedir(d); + CTDB_NO_MEMORY(ctdb, s); + } + + /* only accept names ending in .tdb */ + p = strstr(s, ".tdb."); + if (len < 7 || p == NULL) { + talloc_free(s); + continue; + } + + /* only accept names ending with .tdb. 
and any number of digits */ + q = p+5; + while (*q != 0 && invalid_name == 0) { + if (!isdigit(*q++)) { + invalid_name = 1; + } + } + if (invalid_name == 1 || sscanf(p+5, "%u", &node) != 1 || node != ctdb->pnn) { + DEBUG(DEBUG_ERR,("Ignoring persistent database '%s'\n", de->d_name)); + talloc_free(s); + continue; + } + p[4] = 0; + + if (ctdb_local_attach(ctdb, s, CTDB_DB_FLAGS_PERSISTENT, unhealthy_reason) != 0) { + DEBUG(DEBUG_ERR,("Failed to attach to persistent database '%s'\n", de->d_name)); + closedir(d); + talloc_free(s); + return -1; + } + + DEBUG(DEBUG_INFO,("Attached to persistent database %s\n", s)); + + talloc_free(s); + } + closedir(d); + return 0; +} + +int ctdb_attach_databases(struct ctdb_context *ctdb) +{ + int ret; + char *persistent_health_path = NULL; + char *unhealthy_reason = NULL; + bool first_try = true; + + persistent_health_path = talloc_asprintf(ctdb, "%s/%s.%u", + ctdb->db_directory_state, + PERSISTENT_HEALTH_TDB, + ctdb->pnn); + if (persistent_health_path == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n")); + return -1; + } + +again: + + ctdb->db_persistent_health = tdb_wrap_open(ctdb, persistent_health_path, + 0, TDB_DISALLOW_NESTING, + O_CREAT | O_RDWR, 0600); + if (ctdb->db_persistent_health == NULL) { + struct tdb_wrap *tdb; + + if (!first_try) { + DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n", + persistent_health_path, + errno, + strerror(errno))); + talloc_free(persistent_health_path); + talloc_free(unhealthy_reason); + return -1; + } + first_try = false; + + unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s", + persistent_health_path, + "was cleared after a failure", + "manual verification needed"); + if (unhealthy_reason == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n")); + talloc_free(persistent_health_path); + return -1; + } + + DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - retrying after CLEAR_IF_FIRST\n", + persistent_health_path)); + tdb = 
tdb_wrap_open(ctdb, persistent_health_path, + 0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING, + O_CREAT | O_RDWR, 0600); + if (tdb) { + DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n", + persistent_health_path, + errno, + strerror(errno))); + talloc_free(persistent_health_path); + talloc_free(unhealthy_reason); + return -1; + } + + talloc_free(tdb); + goto again; + } + ret = tdb_check(ctdb->db_persistent_health->tdb, NULL, NULL); + if (ret != 0) { + struct tdb_wrap *tdb; + + talloc_free(ctdb->db_persistent_health); + ctdb->db_persistent_health = NULL; + + if (!first_try) { + DEBUG(DEBUG_CRIT,("tdb_check('%s') failed\n", + persistent_health_path)); + talloc_free(persistent_health_path); + talloc_free(unhealthy_reason); + return -1; + } + first_try = false; + + unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s", + persistent_health_path, + "was cleared after a failure", + "manual verification needed"); + if (unhealthy_reason == NULL) { + DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n")); + talloc_free(persistent_health_path); + return -1; + } + + DEBUG(DEBUG_CRIT,("tdb_check('%s') failed - retrying after CLEAR_IF_FIRST\n", + persistent_health_path)); + tdb = tdb_wrap_open(ctdb, persistent_health_path, + 0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING, + O_CREAT | O_RDWR, 0600); + if (tdb) { + DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n", + persistent_health_path, + errno, + strerror(errno))); + talloc_free(persistent_health_path); + talloc_free(unhealthy_reason); + return -1; + } + + talloc_free(tdb); + goto again; + } + talloc_free(persistent_health_path); + + ret = ctdb_attach_persistent(ctdb, unhealthy_reason); + talloc_free(unhealthy_reason); + if (ret != 0) { + return ret; + } + + return 0; +} + +/* + called when a broadcast seqnum update comes in + */ +int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint32_t srcnode) +{ + struct ctdb_db_context 
*ctdb_db; + if (srcnode == ctdb->pnn) { + /* don't update ourselves! */ + return 0; + } + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_update_seqnum\n", db_id)); + return -1; + } + + if (ctdb_db->unhealthy_reason) { + DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_ltdb_update_seqnum: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + return -1; + } + + tdb_increment_seqnum_nonblock(ctdb_db->ltdb->tdb); + ctdb_db->seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb); + return 0; +} + +/* + timer to check for seqnum changes in a ltdb and propagate them + */ +static void ctdb_ltdb_seqnum_check(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *p) +{ + struct ctdb_db_context *ctdb_db = talloc_get_type(p, struct ctdb_db_context); + struct ctdb_context *ctdb = ctdb_db->ctdb; + uint32_t new_seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb); + if (new_seqnum != ctdb_db->seqnum) { + /* something has changed - propagate it */ + TDB_DATA data; + data.dptr = (uint8_t *)&ctdb_db->db_id; + data.dsize = sizeof(uint32_t); + ctdb_daemon_send_control(ctdb, + CTDB_BROADCAST_ACTIVE, + 0, + CTDB_CONTROL_UPDATE_SEQNUM, + 0, + CTDB_CTRL_FLAG_NOREPLY, + data, + NULL, + NULL); + } + ctdb_db->seqnum = new_seqnum; + + /* setup a new timer */ + ctdb_db->seqnum_update = + tevent_add_timer(ctdb->ev, ctdb_db, + timeval_current_ofs(ctdb->tunable.seqnum_interval/1000, + (ctdb->tunable.seqnum_interval%1000)*1000), + ctdb_ltdb_seqnum_check, ctdb_db); +} + +/* + enable seqnum handling on this db + */ +int32_t ctdb_ltdb_enable_seqnum(struct ctdb_context *ctdb, uint32_t db_id) +{ + struct ctdb_db_context *ctdb_db; + ctdb_db = find_ctdb_db(ctdb, db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_enable_seqnum\n", db_id)); + return -1; + } + + if (ctdb_db->seqnum_update == NULL) { + ctdb_db->seqnum_update = tevent_add_timer( + ctdb->ev, ctdb_db, + 
timeval_current_ofs(ctdb->tunable.seqnum_interval/1000, + (ctdb->tunable.seqnum_interval%1000)*1000), + ctdb_ltdb_seqnum_check, ctdb_db); + } + + tdb_enable_seqnum(ctdb_db->ltdb->tdb); + ctdb_db->seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb); + return 0; +} + +int ctdb_set_db_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db) +{ + if (ctdb_db_sticky(ctdb_db)) { + return 0; + } + + if (! ctdb_db_volatile(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("Non-volatile databases do not support sticky flag\n")); + return -1; + } + + ctdb_db->sticky_records = trbt_create(ctdb_db, 0); + + ctdb_db_set_sticky(ctdb_db); + + DEBUG(DEBUG_NOTICE,("set db sticky %s\n", ctdb_db->db_name)); + + return 0; +} + +void ctdb_db_statistics_reset(struct ctdb_db_context *ctdb_db) +{ + unsigned int i; + + for (i=0; i<MAX_HOT_KEYS; i++) { + if (ctdb_db->hot_keys[i].key.dsize > 0) { + TALLOC_FREE(ctdb_db->hot_keys[i].key.dptr); + ctdb_db->hot_keys[i].key.dsize = 0; + } + ctdb_db->hot_keys[i].count = 0; + ctdb_db->hot_keys[i].last_logged_count = 0; + } + + ZERO_STRUCT(ctdb_db->statistics); +} + +int32_t ctdb_control_get_db_statistics(struct ctdb_context *ctdb, + uint32_t db_id, + TDB_DATA *outdata) +{ + struct ctdb_db_context *ctdb_db; + struct ctdb_db_statistics_old *stats; + unsigned int i; + size_t len; + char *ptr; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in get_db_statistics\n", db_id)); + return -1; + } + + len = offsetof(struct ctdb_db_statistics_old, hot_keys_wire); + for (i = 0; i < MAX_HOT_KEYS; i++) { + struct ctdb_db_statistics_old *s = &ctdb_db->statistics; + + s->hot_keys[i].key.dsize = ctdb_db->hot_keys[i].key.dsize; + s->hot_keys[i].key.dptr = ctdb_db->hot_keys[i].key.dptr; + s->hot_keys[i].count = ctdb_db->hot_keys[i].count; + + len += s->hot_keys[i].key.dsize; + } + + stats = talloc_size(outdata, len); + if (stats == NULL) { + DEBUG(DEBUG_ERR,("Failed to allocate db statistics structure\n")); + return -1; + } + + 
memcpy(stats, &ctdb_db->statistics, + offsetof(struct ctdb_db_statistics_old, hot_keys_wire)); + + stats->num_hot_keys = MAX_HOT_KEYS; + + ptr = &stats->hot_keys_wire[0]; + for (i = 0; i < MAX_HOT_KEYS; i++) { + memcpy(ptr, ctdb_db->statistics.hot_keys[i].key.dptr, + ctdb_db->statistics.hot_keys[i].key.dsize); + ptr += ctdb_db->statistics.hot_keys[i].key.dsize; + } + + outdata->dptr = (uint8_t *)stats; + outdata->dsize = len; + + return 0; +} diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c new file mode 100644 index 0000000..ab58ec4 --- /dev/null +++ b/ctdb/server/ctdb_monitor.c @@ -0,0 +1,509 @@ +/* + monitoring links to all other nodes to detect dead nodes + + + Copyright (C) Ronnie Sahlberg 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/network.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/util_process.h" + +#include "ctdb_private.h" + +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" + +struct ctdb_monitor_state { + TALLOC_CTX *monitor_context; + uint32_t next_interval; + uint32_t event_script_timeouts; +}; + +static void ctdb_check_health(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data); + +static int ctdb_run_notification_script_child(struct ctdb_context *ctdb, const char *event) +{ + struct stat st; + int ret; + char *cmd; + + if (stat(ctdb->notification_script, &st) != 0) { + DEBUG(DEBUG_ERR,("Could not stat notification script %s. Can not send notifications.\n", ctdb->notification_script)); + return -1; + } + if (!(st.st_mode & S_IXUSR)) { + DEBUG(DEBUG_ERR,("Notification script %s is not executable.\n", ctdb->notification_script)); + return -1; + } + + cmd = talloc_asprintf(ctdb, "%s %s\n", ctdb->notification_script, event); + CTDB_NO_MEMORY(ctdb, cmd); + + ret = system(cmd); + /* if the system() call was successful, translate ret into the + return code from the command + */ + if (ret != -1) { + ret = WEXITSTATUS(ret); + } + if (ret != 0) { + DEBUG(DEBUG_ERR,("Notification script \"%s\" failed with error %d\n", cmd, ret)); + } + + return ret; +} + +void ctdb_run_notification_script(struct ctdb_context *ctdb, const char *event) +{ + pid_t child; + + if (ctdb->notification_script == NULL) { + return; + } + + child = ctdb_fork(ctdb); + if (child == (pid_t)-1) { + DEBUG(DEBUG_ERR,("Failed to fork() a notification child process\n")); + return; + } + if (child == 0) { + int ret; + + prctl_set_comment("ctdb_notification"); + ret = ctdb_run_notification_script_child(ctdb, event); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " 
Notification script failed\n")); + } + _exit(0); + } + + return; +} + +/* + called when a health monitoring event script finishes + */ +static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) +{ + struct ctdb_node *node = ctdb->nodes[ctdb->pnn]; + TDB_DATA data; + struct ctdb_node_flag_change c; + uint32_t next_interval; + int ret; + TDB_DATA rddata; + struct ctdb_srvid_message rd; + const char *state_str = NULL; + + c.pnn = ctdb->pnn; + c.old_flags = node->flags; + + ZERO_STRUCT(rd); + rd.pnn = ctdb->pnn; + rd.srvid = 0; + + rddata.dptr = (uint8_t *)&rd; + rddata.dsize = sizeof(rd); + + if (status == ECANCELED) { + DEBUG(DEBUG_ERR,("Monitoring event was cancelled\n")); + goto after_change_status; + } + + if (status == ETIMEDOUT) { + ctdb->monitor->event_script_timeouts++; + + if (ctdb->monitor->event_script_timeouts >= + ctdb->tunable.monitor_timeout_count) { + DEBUG(DEBUG_ERR, + ("Maximum monitor timeout count %u reached." + " Making node unhealthy\n", + ctdb->tunable.monitor_timeout_count)); + } else { + /* We pretend this is OK. 
*/ + goto after_change_status; + } + } else { + ctdb->monitor->event_script_timeouts = 0; + } + + if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) { + DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n")); + node->flags |= NODE_FLAGS_UNHEALTHY; + ctdb->monitor->next_interval = 5; + + ctdb_run_notification_script(ctdb, "unhealthy"); + } else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) { + DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n")); + node->flags &= ~NODE_FLAGS_UNHEALTHY; + ctdb->monitor->next_interval = 5; + + ctdb_run_notification_script(ctdb, "healthy"); + } + +after_change_status: + next_interval = ctdb->monitor->next_interval; + + ctdb->monitor->next_interval *= 2; + if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) { + ctdb->monitor->next_interval = ctdb->tunable.monitor_interval; + } + + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(next_interval, 0), + ctdb_check_health, ctdb); + + if (c.old_flags == node->flags) { + return; + } + + c.new_flags = node->flags; + + data.dptr = (uint8_t *)&c; + data.dsize = sizeof(c); + + /* ask the recovery daemon to push these changes out to all nodes */ + ctdb_daemon_send_message(ctdb, ctdb->pnn, + CTDB_SRVID_PUSH_NODE_FLAGS, data); + + if (c.new_flags & NODE_FLAGS_UNHEALTHY) { + state_str = "UNHEALTHY"; + } else { + state_str = "HEALTHY"; + } + + /* ask the recmaster to reallocate all addresses */ + DEBUG(DEBUG_ERR, + ("Node became %s. 
Ask recovery master to reallocate IPs\n", + state_str)); + ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_TAKEOVER_RUN, rddata); + if (ret != 0) { + DEBUG(DEBUG_ERR, + (__location__ + " Failed to send IP takeover run request\n")); + } +} + + +static void ctdb_run_startup(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data); +/* + called when the startup event script finishes + */ +static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p) +{ + if (status != 0) { + DEBUG(DEBUG_ERR,("startup event failed\n")); + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(5, 0), + ctdb_run_startup, ctdb); + return; + } + + DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n")); + ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING); + ctdb->monitor->next_interval = 2; + ctdb_run_notification_script(ctdb, "startup"); + + /* tell all other nodes we've just started up */ + ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, + 0, CTDB_CONTROL_STARTUP, 0, + CTDB_CTRL_FLAG_NOREPLY, + tdb_null, NULL, NULL); + + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(ctdb->monitor->next_interval, 0), + ctdb_check_health, ctdb); +} + +static void ctdb_run_startup(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, + struct ctdb_context); + int ret; + + /* This is necessary to avoid the "startup" event colliding + * with the "ipreallocated" event from the takeover run + * following the first recovery. We might as well serialise + * these things if we can. + */ + if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) { + DEBUG(DEBUG_NOTICE, + ("Not yet in startup runstate. 
Wait one more second\n")); + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(1, 0), + ctdb_run_startup, ctdb); + return; + } + + DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n")); + ret = ctdb_event_script_callback(ctdb, + ctdb->monitor->monitor_context, + ctdb_startup_callback, + ctdb, CTDB_EVENT_STARTUP, "%s", ""); + + if (ret != 0) { + DEBUG(DEBUG_ERR,("Unable to launch startup event script\n")); + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(5, 0), + ctdb_run_startup, ctdb); + } +} + +/* + wait until we have finished initial recoveries before we start the + monitoring events + */ +static void ctdb_wait_until_recovered(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + int ret; + static int count = 0; + + count++; + + if (count < 60 || count%600 == 0) { + DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n")); + if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_STOPPED) { + DEBUG(DEBUG_NOTICE,("Node is STOPPED. Node will NOT recover.\n")); + } + } + + if (ctdb->vnn_map->generation == INVALID_GENERATION) { + ctdb->db_persistent_startup_generation = INVALID_GENERATION; + + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(1, 0), + ctdb_wait_until_recovered, ctdb); + return; + } + + if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { + ctdb->db_persistent_startup_generation = INVALID_GENERATION; + + DEBUG(DEBUG_NOTICE,(__location__ " in recovery. 
Wait one more second\n")); + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(1, 0), + ctdb_wait_until_recovered, ctdb); + return; + } + + + if (!fast_start && timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) { + ctdb->db_persistent_startup_generation = INVALID_GENERATION; + + DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n")); + + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(1, 0), + ctdb_wait_until_recovered, ctdb); + return; + } + + if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) { + DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() " + "until the next recovery\n")); + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(1, 0), + ctdb_wait_until_recovered, ctdb); + return; + } + + ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation; + ret = ctdb_recheck_persistent_health(ctdb); + if (ret != 0) { + ctdb->db_persistent_check_errors++; + if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) { + DEBUG(DEBUG_ERR, + (__location__ "ctdb_recheck_persistent_health() " + "failed (%llu of %llu times) - retry later\n", + (unsigned long long)ctdb->db_persistent_check_errors, + (unsigned long long)ctdb->max_persistent_check_errors)); + tevent_add_timer(ctdb->ev, + ctdb->monitor->monitor_context, + timeval_current_ofs(1, 0), + ctdb_wait_until_recovered, ctdb); + return; + } + DEBUG(DEBUG_ALERT,(__location__ + "ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n", + (unsigned long long)ctdb->db_persistent_check_errors)); + ctdb_shutdown_sequence(ctdb, 11); + /* In case above returns due to duplicate shutdown */ + return; + } + ctdb->db_persistent_check_errors = 0; + + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current(), ctdb_run_startup, ctdb); +} + + +/* + see if the 
event scripts think we are healthy + */ +static void ctdb_check_health(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + bool skip_monitoring = false; + int ret = 0; + + if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL || + ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE || + ctdb->runstate != CTDB_RUNSTATE_RUNNING) { + skip_monitoring = true; + } else { + if (ctdb_db_all_frozen(ctdb)) { + DEBUG(DEBUG_ERR, + ("Skip monitoring since databases are frozen\n")); + skip_monitoring = true; + } + } + + if (skip_monitoring) { + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(ctdb->monitor->next_interval, 0), + ctdb_check_health, ctdb); + return; + } + + ret = ctdb_event_script_callback(ctdb, + ctdb->monitor->monitor_context, + ctdb_health_callback, + ctdb, CTDB_EVENT_MONITOR, "%s", ""); + if (ret != 0) { + DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n")); + ctdb->monitor->next_interval = 5; + tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(5, 0), + ctdb_check_health, ctdb); + } +} + +/* stop any monitoring + this should only be done when shutting down the daemon +*/ +void ctdb_stop_monitoring(struct ctdb_context *ctdb) +{ + if (ctdb->monitor == NULL) { + D_NOTICE("Monitoring not yet initialised\n"); + return; + } + + TALLOC_FREE(ctdb->monitor->monitor_context); + + ctdb->monitor->next_interval = 5; + DEBUG(DEBUG_NOTICE,("Monitoring has been stopped\n")); +} + +/* + start watching for nodes that might be dead + */ +void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb) +{ + ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY); + + ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state); + CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor); + + ctdb->monitor->monitor_context = talloc_new(ctdb->monitor); + CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context); + + 
tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(1, 0), + ctdb_wait_until_recovered, ctdb); +} + + +/* + modify flags on a node + */ +int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)indata.dptr; + struct ctdb_node *node; + uint32_t old_flags; + + /* + * Don't let other nodes override the current node's flags. + * The recovery master fetches flags from this node so there's + * no need to push them back. Doing so is racy. + */ + if (c->pnn == ctdb->pnn) { + DBG_DEBUG("Ignoring flag changes for current node\n"); + return 0; + } + + node = ctdb_find_node(ctdb, c->pnn); + if (node == NULL) { + DBG_ERR("Node %u is invalid\n", c->pnn); + return -1; + } + + if (node->flags & NODE_FLAGS_DISCONNECTED) { + DBG_DEBUG("Ignoring flag changes for disconnected node\n"); + return 0; + } + + /* + * Remember the old flags. We don't care what some other node + * thought the old flags were - that's irrelevant. 
+ */ + old_flags = node->flags; + + /* + * This node tracks nodes it is connected to, so don't let + * another node override this + */ + node->flags = + (old_flags & NODE_FLAGS_DISCONNECTED) | + (c->new_flags & ~NODE_FLAGS_DISCONNECTED); + + if (node->flags == old_flags) { + return 0; + } + + D_NOTICE("Node %u has changed flags - 0x%x -> 0x%x\n", + c->pnn, + old_flags, + node->flags); + + if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) { + DBG_ERR("Node %u became healthy - force recovery for startup\n", + c->pnn); + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + } + + return 0; +} diff --git a/ctdb/server/ctdb_mutex_fcntl_helper.c b/ctdb/server/ctdb_mutex_fcntl_helper.c new file mode 100644 index 0000000..aac98ea --- /dev/null +++ b/ctdb/server/ctdb_mutex_fcntl_helper.c @@ -0,0 +1,795 @@ +/* + CTDB mutex fcntl lock file helper + + Copyright (C) Martin Schwenke 2015 + + wait_for_parent() code from ctdb_lock_helper.c: + + Copyright (C) Amitay Isaacs 2013 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/network.h" +#include "system/wait.h" +#include "system/dir.h" + +#include <tevent.h> + +#include "lib/util/sys_rw.h" +#include "lib/util/tevent_unix.h" +#include "lib/util/util.h" +#include "lib/util/smb_strtox.h" + +/* protocol.h is just needed for ctdb_sock_addr, which is used in system.h */ +#include "protocol/protocol.h" +#include "common/system.h" +#include "common/tmon.h" + +static char progpath[PATH_MAX]; +static char *progname = NULL; + +static int fcntl_lock_fd(int fd, bool block, off_t start) +{ + static struct flock lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_len = 1, + .l_pid = 0, + }; + int cmd = block ? F_SETLKW : F_SETLK; + + lock.l_start = start; + if (fcntl(fd, cmd, &lock) != 0) { + return errno; + } + + return 0; +} + +static char fcntl_lock(const char *file, int *outfd) +{ + int fd; + int ret; + + fd = open(file, O_RDWR|O_CREAT, 0600); + if (fd == -1) { + fprintf(stderr, "%s: Unable to open %s - (%s)\n", + progname, file, strerror(errno)); + return '3'; + } + + ret = fcntl_lock_fd(fd, false, 0); + if (ret != 0) { + close(fd); + if (ret == EACCES || ret == EAGAIN) { + /* Lock contention, fail silently */ + return '1'; + } + + /* Log an error for any other failure */ + fprintf(stderr, + "%s: Failed to get lock on '%s' - (%s)\n", + progname, + file, + strerror(ret)); + return '3'; + } + + *outfd = fd; + + return '0'; +} + +/* + * Wait and see if the parent exits + */ + +struct wait_for_parent_state { + struct tevent_context *ev; + pid_t ppid; +}; + +static void wait_for_parent_check(struct tevent_req *subreq); + +static struct tevent_req *wait_for_parent_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + pid_t ppid) +{ + struct tevent_req *req, *subreq; + struct wait_for_parent_state *state; + + req = tevent_req_create(mem_ctx, &state, struct wait_for_parent_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->ppid = ppid; + + if 
(ppid == 1) { + fprintf(stderr, "parent == 1\n"); + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + subreq = tevent_wakeup_send(state, ev, + tevent_timeval_current_ofs(5,0)); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, wait_for_parent_check, req); + + return req; +} + +static void wait_for_parent_check(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct wait_for_parent_state *state = tevent_req_data( + req, struct wait_for_parent_state); + bool status; + + status = tevent_wakeup_recv(subreq); + TALLOC_FREE(subreq); + if (! status) { + /* Ignore error */ + fprintf(stderr, "%s: tevent_wakeup_recv() failed\n", progname); + } + + if (kill(state->ppid, 0) == -1 && errno == ESRCH) { + fprintf(stderr, "parent gone\n"); + tevent_req_done(req); + return; + } + + subreq = tevent_wakeup_send(state, state->ev, + tevent_timeval_current_ofs(5,0)); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, wait_for_parent_check, req); +} + +static bool wait_for_parent_recv(struct tevent_req *req, int *perr) +{ + if (tevent_req_is_unix_error(req, perr)) { + return false; + } + + return true; +} + +/* + * Perform I/O on lock in a loop - complete when file removed or replaced + */ + +struct lock_io_check_state { + struct tevent_context *ev; + const char *lock_file; + ino_t inode; + unsigned long recheck_interval; +}; + +static void lock_io_check_loop(struct tevent_req *subreq); + +static struct tevent_req *lock_io_check_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + const char *lock_file, + ino_t inode, + unsigned long recheck_interval) +{ + struct tevent_req *req, *subreq; + struct lock_io_check_state *state; + + req = tevent_req_create(mem_ctx, &state, struct lock_io_check_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->lock_file = lock_file; + state->inode = 
inode; + state->recheck_interval = recheck_interval; + + subreq = tevent_wakeup_send( + state, + ev, + tevent_timeval_current_ofs(state->recheck_interval, 0)); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, lock_io_check_loop, req); + + return req; +} + +static void lock_io_check_loop(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct lock_io_check_state *state = tevent_req_data( + req, struct lock_io_check_state); + bool status; + struct stat sb; + int fd = -1; + int ret; + + status = tevent_wakeup_recv(subreq); + TALLOC_FREE(subreq); + if (! status) { + /* Ignore error */ + fprintf(stderr, "%s: tevent_wakeup_recv() failed\n", progname); + } + + fd = open(state->lock_file, O_RDWR); + if (fd == -1) { + fprintf(stderr, + "%s: " + "lock lost - lock file \"%s\" open failed (ret=%d)\n", + progname, + state->lock_file, + errno); + goto done; + } + + ret = fstat(fd, &sb); + if (ret != 0) { + fprintf(stderr, + "%s: " + "lock lost - lock file \"%s\" check failed (ret=%d)\n", + progname, + state->lock_file, + errno); + goto done; + } + + if (sb.st_ino != state->inode) { + fprintf(stderr, + "%s: lock lost - lock file \"%s\" inode changed\n", + progname, + state->lock_file); + goto done; + } + + /* + * Attempt to lock a 2nd byte range. Using a blocking lock + * encourages ping timeouts if the cluster filesystem is in a + * bad state. It also makes testing easier. 
+ */ + ret = fcntl_lock_fd(fd, true, 1); + if (ret != 0) { + fprintf(stderr, + "%s: " + "lock fail - lock file \"%s\" test lock error (%d)\n", + progname, + state->lock_file, + ret); + goto done; + } + + /* Unlock occurs on close */ + close(fd); + + subreq = tevent_wakeup_send( + state, + state->ev, + tevent_timeval_current_ofs(state->recheck_interval, 0)); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, lock_io_check_loop, req); + + return; + +done: + if (fd != -1) { + close(fd); + } + tevent_req_done(req); +} + +static bool lock_io_check_recv(struct tevent_req *req, int *perr) +{ + if (tevent_req_is_unix_error(req, perr)) { + return false; + } + + return true; +} + +struct lock_test_child_state { +}; + +static void lock_test_child_ping_done(struct tevent_req *subreq); +static void lock_test_child_io_check_done(struct tevent_req *subreq); + +static struct tevent_req *lock_test_child_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + const char *lock_file, + int fd, + ino_t inode, + unsigned long recheck_interval, + bool send_pings) +{ + struct tevent_req *req, *subreq; + struct lock_test_child_state *state; + unsigned int interval = send_pings ? 
1 : 0; + + req = tevent_req_create(mem_ctx, &state, struct lock_test_child_state); + if (req == NULL) { + return NULL; + } + + subreq = tmon_ping_send(state, ev, fd, TMON_FD_BOTH, 0, interval); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, lock_test_child_ping_done, req); + + subreq = lock_io_check_send(state, + ev, + lock_file, + inode, + recheck_interval); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, lock_test_child_io_check_done, req); + + return req; +} + +static void lock_test_child_ping_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + bool status; + int err; + + status = tmon_ping_recv(subreq, &err); + TALLOC_FREE(subreq); + if (!status) { + tevent_req_error(req, err); + return; + } + + tevent_req_done(req); +} + +static void lock_test_child_io_check_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + bool status; + int err; + + status = lock_io_check_recv(subreq, &err); + TALLOC_FREE(subreq); + if (!status) { + tevent_req_error(req, err); + return; + } + + tevent_req_done(req); +} + +static bool lock_test_child_recv(struct tevent_req *req, int *perr) +{ + if (tevent_req_is_unix_error(req, perr)) { + /* Parent exit is expected */ + if (*perr == EPIPE) { + return true; + } + return false; + } + + return true; +} + +static void lock_test_child(const char *lock_file, + int lock_fd, + int pipe_fd, + unsigned long recheck_interval, + bool send_pings) +{ + struct tevent_context *ev; + struct tevent_req *req; + struct stat sb; + ino_t inode; + bool status; + int ret; + + ret = fstat(lock_fd, &sb); + if (ret != 0) { + fprintf(stderr, + "%s: lock lost - " + "lock file \"%s\" stat failed (ret=%d)\n", + progname, + lock_file, + errno); + _exit(1); + } + inode = sb.st_ino; + close(lock_fd); + + ev = 
tevent_context_init(NULL); + if (ev == NULL) { + fprintf(stderr, "%s: tevent_context_init() failed\n", progname); + _exit(1); + } + + req = lock_test_child_send(ev, + ev, + lock_file, + pipe_fd, + inode, + recheck_interval, + send_pings); + if (req == NULL) { + fprintf(stderr, + "%s: lock_test_child_send() failed\n", + progname); + _exit(1); + } + + tevent_req_poll(req, ev); + + status = lock_test_child_recv(req, &ret); + if (! status) { + fprintf(stderr, + "%s: lock_test_child_recv() failed (%d)\n", + progname, + ret); + _exit(1); + } + + _exit(0); +} + +struct lock_test_state { + int *lock_fdp; + int pipe_fd; + pid_t child_pid; +}; + +static void lock_test_ping_done(struct tevent_req *subreq); + +static struct tevent_req *lock_test_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + const char *lock_file, + int *fdp, + unsigned long recheck_interval, + unsigned long ping_timeout) +{ + struct tevent_req *req, *subreq; + struct lock_test_state *state; + pid_t pid; + int sv[2]; + int ret; + + req = tevent_req_create(mem_ctx, &state, struct lock_test_state); + if (req == NULL) { + return NULL; + } + + ret = socketpair(AF_UNIX, SOCK_STREAM, 0, sv); + if (ret != 0) { + fprintf(stderr, + "%s: socketpair() failed (errno=%d)\n", + progname, + errno); + tevent_req_error(req, errno); + return tevent_req_post(req, ev); + } + + pid = fork(); + if (pid == -1) { + + int err = errno; + fprintf(stderr, "%s: fork() failed (errno=%d)\n", progname, err); + close(sv[0]); + close(sv[1]); + tevent_req_error(req, err); + return tevent_req_post(req, ev); + } + if (pid == 0) { + /* Child */ + close(sv[0]); + TALLOC_FREE(ev); + + lock_test_child(lock_file, + *fdp, + sv[1], + recheck_interval, + ping_timeout != 0); + /* Above does not return */ + } + + /* Parent */ + close(sv[1]); + + state->lock_fdp = fdp; + state->pipe_fd = sv[0]; + state->child_pid = pid; + + subreq = tmon_ping_send(state, ev, sv[0], TMON_FD_BOTH, ping_timeout, 0); + if (tevent_req_nomem(subreq, req)) { + 
close(sv[0]); + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, lock_test_ping_done, req); + + return req; +} + +static void lock_test_ping_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct lock_test_state *state = tevent_req_data( + req, struct lock_test_state); + int wstatus; + bool status; + int err; + + status = tmon_ping_recv(subreq, &err); + TALLOC_FREE(subreq); + if (! status) { + switch (err) { + case EPIPE: + /* Child exit, child already printed message */ + break; + case ETIMEDOUT: + fprintf(stderr, + "%s: ping timeout from lock test child\n", + progname); + break; + default: + fprintf(stderr, + "%s: tmon_ping_recv() failed (%d)\n", + progname, + err); + } + /* Ignore error */ + } + + /* + * Lock checking child is gone or not sending pings. Release + * the lock, close this end of pipe, send SIGKILL to the child + * process and wait for the child to exit. + */ + close(*state->lock_fdp); + *state->lock_fdp = -1; + close(state->pipe_fd); + kill(state->child_pid, SIGKILL); + waitpid(state->child_pid, &wstatus, 0); + + tevent_req_done(req); +} + +static bool lock_test_recv(struct tevent_req *req, int *perr) +{ + if (tevent_req_is_unix_error(req, perr)) { + return false; + } + + return true; +} + +/* + * Wait for a reason to exit, indicating that parent has exited or I/O + * on lock failed + */ + +struct wait_for_exit_state { +}; + +static void wait_for_exit_parent_done(struct tevent_req *subreq); +static void wait_for_exit_lock_test_done(struct tevent_req *subreq); + +static struct tevent_req *wait_for_exit_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + pid_t ppid, + const char *lock_file, + int *fdp, + unsigned long recheck_interval, + unsigned long ping_timeout) +{ + struct tevent_req *req, *subreq; + struct wait_for_exit_state *state; + + req = tevent_req_create(mem_ctx, &state, struct wait_for_exit_state); + if (req == NULL) { + return NULL; + } + 
+ subreq = wait_for_parent_send(state, ev, ppid); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, wait_for_exit_parent_done, req); + + if (recheck_interval > 0) { + subreq = lock_test_send(state, + ev, + lock_file, + fdp, + recheck_interval, + ping_timeout); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, + wait_for_exit_lock_test_done, + req); + } + + return req; +} + +static void wait_for_exit_parent_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + bool status; + int err; + + status = wait_for_parent_recv(subreq, &err); + TALLOC_FREE(subreq); + if (! status) { + /* Ignore error */ + fprintf(stderr, + "%s: " + "wait_for_parent_recv() failed (%d)\n", + progname, + err); + } + + tevent_req_done(req); +} + +static void wait_for_exit_lock_test_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + bool status; + int err; + + status = lock_test_recv(subreq, &err); + TALLOC_FREE(subreq); + if (! 
status) { + fprintf(stderr, + "%s: " + "lock_test_recv() failed (%d)\n", + progname, + err); + /* Ignore error, fall through to done */ + } + + tevent_req_done(req); +} + +static bool wait_for_exit_recv(struct tevent_req *req, int *perr) +{ + if (tevent_req_is_unix_error(req, perr)) { + return false; + } + + return true; +} + +static void usage(void) +{ + fprintf(stderr, + "Usage: %s <file> [recheck_interval [ping_timeout]]\n", + progname); +} + +int main(int argc, char *argv[]) +{ + struct tevent_context *ev; + char result; + int ppid; + const char *file = NULL; + unsigned long recheck_interval; + unsigned long ping_timeout; + int ret; + int fd = -1; + struct tevent_req *req; + bool status; + + strlcpy(progpath, argv[0], sizeof(progpath)); + progname = basename(progpath); + + if (argc < 2 || argc > 4) { + usage(); + exit(1); + } + + ev = tevent_context_init(NULL); + if (ev == NULL) { + fprintf(stderr, "locking: tevent_context_init() failed\n"); + exit(1); + } + + ppid = getppid(); + + file = argv[1]; + + recheck_interval = 5; + ping_timeout = 0; + if (argc >= 3) { + recheck_interval = smb_strtoul(argv[2], + NULL, + 10, + &ret, + SMB_STR_STANDARD); + if (ret != 0) { + usage(); + exit(1); + } + } + if (argc >= 4) { + ping_timeout = smb_strtoul(argv[3], + NULL, + 10, + &ret, + SMB_STR_STANDARD); + if (ret != 0) { + usage(); + exit(1); + } + } + + result = fcntl_lock(file, &fd); + sys_write(STDOUT_FILENO, &result, 1); + + if (result != '0') { + return 0; + } + + req = wait_for_exit_send(ev, + ev, + ppid, + file, + &fd, + recheck_interval, + ping_timeout); + if (req == NULL) { + fprintf(stderr, + "%s: wait_for_exit_send() failed\n", + progname); + exit(1); + } + + tevent_req_poll(req, ev); + + status = wait_for_exit_recv(req, &ret); + if (! 
status) { + fprintf(stderr, + "%s: wait_for_exit_recv() failed (%d)\n", + progname, + ret); + } + + if (fd != -1) { + close(fd); + } + + return 0; +} diff --git a/ctdb/server/ctdb_persistent.c b/ctdb/server/ctdb_persistent.c new file mode 100644 index 0000000..2671744 --- /dev/null +++ b/ctdb/server/ctdb_persistent.c @@ -0,0 +1,397 @@ +/* + persistent store logic + + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Ronnie Sahlberg 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/network.h" +#include "system/time.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" + +#include "ctdb_private.h" + +#include "common/reqid.h" +#include "common/common.h" +#include "common/logging.h" + +struct ctdb_persistent_state { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; /* used by trans3_commit */ + struct ctdb_client *client; /* used by trans3_commit */ + struct ctdb_req_control_old *c; + const char *errormsg; + uint32_t num_pending; + int32_t status; + uint32_t num_failed, num_sent; +}; + +/* + 1) all nodes fail, and all nodes reply + 2) some nodes fail, all nodes reply + 3) some nodes timeout + 4) all nodes succeed + */ + +/* + called when a node has acknowledged a ctdb_control_update_record call + */ +static void ctdb_persistent_callback(struct ctdb_context *ctdb, + int32_t status, TDB_DATA data, + const char *errormsg, + void *private_data) +{ + struct ctdb_persistent_state *state = talloc_get_type(private_data, + struct ctdb_persistent_state); + + if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { + DEBUG(DEBUG_INFO, ("ctdb_persistent_callback: ignoring reply " + "during recovery\n")); + return; + } + + if (status != 0) { + DEBUG(DEBUG_ERR,("ctdb_persistent_callback failed with status %d (%s)\n", + status, errormsg?errormsg:"no error message given")); + state->status = status; + state->errormsg = errormsg; + state->num_failed++; + + /* + * If a node failed to complete the update_record control, + * then either a recovery is already running or something + * bad is going on. So trigger a recovery and let the + * recovery finish the transaction, sending back the reply + * for the trans3_commit control to the client. 
+ */ + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + return; + } + + state->num_pending--; + + if (state->num_pending != 0) { + return; + } + + ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, state->errormsg); + talloc_free(state); +} + +/* + called if persistent store times out + */ +static void ctdb_persistent_store_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_persistent_state *state = talloc_get_type(private_data, struct ctdb_persistent_state); + + if (state->ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { + DEBUG(DEBUG_INFO, ("ctdb_persistent_store_timeout: ignoring " + "timeout during recovery\n")); + return; + } + + ctdb_request_control_reply(state->ctdb, state->c, NULL, 1, + "timeout in ctdb_persistent_state"); + + talloc_free(state); +} + +/** + * Finish pending trans3 commit controls, i.e. send + * reply to the client. This is called by the end-recovery + * control to fix the situation when a recovery interrupts + * the usual progress of a transaction. + */ +void ctdb_persistent_finish_trans3_commits(struct ctdb_context *ctdb) +{ + struct ctdb_db_context *ctdb_db; + + if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { + DEBUG(DEBUG_INFO, ("ctdb_persistent_finish_trans3_commits: " + "skipping execution when recovery is " + "active\n")); + return; + } + + for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) { + struct ctdb_persistent_state *state; + + if (ctdb_db->persistent_state == NULL) { + continue; + } + + state = ctdb_db->persistent_state; + + ctdb_request_control_reply(ctdb, state->c, NULL, 2, + "trans3 commit ended by recovery"); + + /* The destructor sets ctdb_db->persistent_state to NULL. 
*/ + talloc_free(state); + } +} + +static int ctdb_persistent_state_destructor(struct ctdb_persistent_state *state) +{ + if (state->client != NULL) { + state->client->db_id = 0; + } + + if (state->ctdb_db != NULL) { + state->ctdb_db->persistent_state = NULL; + } + + return 0; +} + +/* + * Store a set of persistent records. + * This is used to roll out a transaction to all nodes. + */ +int32_t ctdb_control_trans3_commit(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA recdata, bool *async_reply) +{ + struct ctdb_client *client; + struct ctdb_persistent_state *state; + unsigned int i; + struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr; + struct ctdb_db_context *ctdb_db; + + if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { + DEBUG(DEBUG_INFO,("rejecting ctdb_control_trans3_commit when recovery active\n")); + return -1; + } + + client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client); + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " can not match persistent_store " + "to a client. Returning error\n")); + return -1; + } + + if (client->db_id != 0) { + DEBUG(DEBUG_ERR,(__location__ " ERROR: trans3_commit: " + "client-db_id[0x%08x] != 0 " + "(client_id[0x%08x]): trans3_commit active?\n", + client->db_id, client->client_id)); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, m->db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control_trans3_commit: " + "Unknown database db_id[0x%08x]\n", m->db_id)); + return -1; + } + + if (ctdb_db->persistent_state != NULL) { + DEBUG(DEBUG_ERR, (__location__ " Error: " + "ctdb_control_trans3_commit " + "called while a transaction commit is " + "active. 
db_id[0x%08x]\n", m->db_id)); + return -1; + } + + ctdb_db->persistent_state = talloc_zero(ctdb_db, + struct ctdb_persistent_state); + CTDB_NO_MEMORY(ctdb, ctdb_db->persistent_state); + + client->db_id = m->db_id; + + state = ctdb_db->persistent_state; + state->ctdb = ctdb; + state->ctdb_db = ctdb_db; + state->c = c; + state->client = client; + + talloc_set_destructor(state, ctdb_persistent_state_destructor); + + for (i = 0; i < ctdb->vnn_map->size; i++) { + struct ctdb_node *node = ctdb->nodes[ctdb->vnn_map->map[i]]; + int ret; + + /* only send to active nodes */ + if (node->flags & NODE_FLAGS_INACTIVE) { + continue; + } + + ret = ctdb_daemon_send_control(ctdb, node->pnn, 0, + CTDB_CONTROL_UPDATE_RECORD, + c->client_id, 0, recdata, + ctdb_persistent_callback, + state); + if (ret == -1) { + DEBUG(DEBUG_ERR,("Unable to send " + "CTDB_CONTROL_UPDATE_RECORD " + "to pnn %u\n", node->pnn)); + talloc_free(state); + return -1; + } + + state->num_pending++; + state->num_sent++; + } + + if (state->num_pending == 0) { + talloc_free(state); + return 0; + } + + /* we need to wait for the replies */ + *async_reply = true; + + /* need to keep the control structure around */ + talloc_steal(state, c); + + /* but we won't wait forever */ + tevent_add_timer(ctdb->ev, state, + timeval_current_ofs(ctdb->tunable.control_timeout, 0), + ctdb_persistent_store_timeout, state); + + return 0; +} + + +/* + backwards compatibility: + + start a persistent store operation. passing both the key, header and + data to the daemon. If the client disconnects before it has issued + a persistent_update call to the daemon we trigger a full recovery + to ensure the databases are brought back in sync. + for now we ignore the recdata that the client has passed to us. 
+ */ +int32_t ctdb_control_start_persistent_update(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA recdata) +{ + struct ctdb_client *client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client); + + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " can not match start_persistent_update to a client. Returning error\n")); + return -1; + } + + client->num_persistent_updates++; + + return 0; +} + +/* + backwards compatibility: + + called to tell ctdbd that it is no longer doing a persistent update +*/ +int32_t ctdb_control_cancel_persistent_update(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA recdata) +{ + struct ctdb_client *client = reqid_find(ctdb->idr, c->client_id, struct ctdb_client); + + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " can not match cancel_persistent_update to a client. Returning error\n")); + return -1; + } + + if (client->num_persistent_updates > 0) { + client->num_persistent_updates--; + } + + return 0; +} + +static int32_t ctdb_get_db_seqnum(struct ctdb_context *ctdb, + uint32_t db_id, + uint64_t *seqnum) +{ + int32_t ret; + struct ctdb_db_context *ctdb_db; + const char *keyname = CTDB_DB_SEQNUM_KEY; + TDB_DATA key; + TDB_DATA data; + TALLOC_CTX *mem_ctx = talloc_new(ctdb); + struct ctdb_ltdb_header header; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id)); + ret = -1; + goto done; + } + + if (! ctdb_db_allow_access(ctdb_db)) { + ret = -1; + goto done; + } + + key.dptr = (uint8_t *)discard_const(keyname); + key.dsize = strlen(keyname) + 1; + + ret = (int32_t)ctdb_ltdb_fetch(ctdb_db, key, &header, mem_ctx, &data); + if (ret != 0) { + goto done; + } + + if (data.dsize != sizeof(uint64_t)) { + *seqnum = 0; + goto done; + } + + *seqnum = *(uint64_t *)data.dptr; + +done: + talloc_free(mem_ctx); + return ret; +} + +/** + * Get the sequence number of a persistent database. 
+ */ +int32_t ctdb_control_get_db_seqnum(struct ctdb_context *ctdb, + TDB_DATA indata, + TDB_DATA *outdata) +{ + uint32_t db_id; + int32_t ret; + uint64_t seqnum; + + db_id = *(uint32_t *)indata.dptr; + ret = ctdb_get_db_seqnum(ctdb, db_id, &seqnum); + if (ret != 0) { + goto done; + } + + outdata->dsize = sizeof(uint64_t); + outdata->dptr = talloc_memdup(outdata, &seqnum, sizeof(uint64_t)); + if (outdata->dptr == NULL) { + ret = -1; + } + +done: + return ret; +} diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c new file mode 100644 index 0000000..7b30d11 --- /dev/null +++ b/ctdb/server/ctdb_recover.c @@ -0,0 +1,1243 @@ +/* + ctdb recovery code + + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Ronnie Sahlberg 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ +#include "replace.h" +#include "system/time.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> +#include <tdb.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/time.h" +#include "lib/util/util_process.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" + +#include "ctdb_cluster_mutex.h" + +int +ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata) +{ + struct ctdb_vnn_map_wire *map; + size_t len; + + CHECK_CONTROL_DATA_SIZE(0); + + len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size; + map = talloc_size(outdata, len); + CTDB_NO_MEMORY(ctdb, map); + + map->generation = ctdb->vnn_map->generation; + map->size = ctdb->vnn_map->size; + memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size); + + outdata->dsize = len; + outdata->dptr = (uint8_t *)map; + + return 0; +} + +int +ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata) +{ + struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr; + + if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { + DEBUG(DEBUG_ERR, ("Attempt to set vnnmap when not in recovery\n")); + return -1; + } + + talloc_free(ctdb->vnn_map); + + ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map); + CTDB_NO_MEMORY(ctdb, ctdb->vnn_map); + + ctdb->vnn_map->generation = map->generation; + ctdb->vnn_map->size = map->size; + ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size); + CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map); + + memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size); + + return 0; +} + +int +ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata) +{ + uint32_t i, len; + 
struct ctdb_db_context *ctdb_db; + struct ctdb_dbid_map_old *dbid_map; + + CHECK_CONTROL_DATA_SIZE(0); + + len = 0; + for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){ + len++; + } + + + outdata->dsize = offsetof(struct ctdb_dbid_map_old, dbs) + sizeof(dbid_map->dbs[0])*len; + outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize); + if (!outdata->dptr) { + DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n")); + exit(1); + } + + dbid_map = (struct ctdb_dbid_map_old *)outdata->dptr; + dbid_map->num = len; + for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){ + dbid_map->dbs[i].db_id = ctdb_db->db_id; + dbid_map->dbs[i].flags = ctdb_db->db_flags; + } + + return 0; +} + +int +ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata) +{ + CHECK_CONTROL_DATA_SIZE(0); + + outdata->dptr = (unsigned char *)ctdb_node_list_to_map(ctdb->nodes, + ctdb->num_nodes, + outdata); + if (outdata->dptr == NULL) { + return -1; + } + + outdata->dsize = talloc_get_size(outdata->dptr); + + return 0; +} + +/* + reload the nodes file +*/ +int +ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode) +{ + unsigned int i, num_nodes; + TALLOC_CTX *tmp_ctx; + struct ctdb_node **nodes; + + tmp_ctx = talloc_new(ctdb); + + /* steal the old nodes file for a while */ + talloc_steal(tmp_ctx, ctdb->nodes); + nodes = ctdb->nodes; + ctdb->nodes = NULL; + num_nodes = ctdb->num_nodes; + ctdb->num_nodes = 0; + + /* load the new nodes file */ + ctdb_load_nodes_file(ctdb); + + for (i=0; i<ctdb->num_nodes; i++) { + /* keep any identical pre-existing nodes and connections */ + if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) { + talloc_free(ctdb->nodes[i]); + ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]); + continue; + } + + if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) { + continue; + } + + /* any new or different nodes must be added */ + 
if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) { + DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i)); + ctdb_fatal(ctdb, "failed to add node. shutting down\n"); + } + if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) { + DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i)); + ctdb_fatal(ctdb, "failed to connect to node. shutting down\n"); + } + } + + /* tell the recovery daemon to reload the nodes file too */ + ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null); + + talloc_free(tmp_ctx); + + return 0; +} + +struct db_pull_state { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + struct ctdb_marshall_buffer *recs; + uint32_t pnn; + uint64_t srvid; + uint32_t num_records; +}; + +static int traverse_db_pull(struct tdb_context *tdb, TDB_DATA key, + TDB_DATA data, void *private_data) +{ + struct db_pull_state *state = (struct db_pull_state *)private_data; + struct ctdb_marshall_buffer *recs; + + recs = ctdb_marshall_add(state->ctdb, state->recs, + state->ctdb_db->db_id, 0, key, NULL, data); + if (recs == NULL) { + TALLOC_FREE(state->recs); + return -1; + } + state->recs = recs; + + if (talloc_get_size(state->recs) >= + state->ctdb->tunable.rec_buffer_size_limit) { + TDB_DATA buffer; + int ret; + + buffer = ctdb_marshall_finish(state->recs); + ret = ctdb_daemon_send_message(state->ctdb, state->pnn, + state->srvid, buffer); + if (ret != 0) { + TALLOC_FREE(state->recs); + return -1; + } + + state->num_records += state->recs->count; + TALLOC_FREE(state->recs); + } + + return 0; +} + +int32_t ctdb_control_db_pull(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, TDB_DATA *outdata) +{ + struct ctdb_pulldb_ext *pulldb_ext; + struct ctdb_db_context *ctdb_db; + struct db_pull_state state; + int ret; + + pulldb_ext = (struct ctdb_pulldb_ext *)indata.dptr; + + ctdb_db = find_ctdb_db(ctdb, pulldb_ext->db_id); + if (ctdb_db == NULL) { + 
DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", + pulldb_ext->db_id)); + return -1; + } + + if (!ctdb_db_frozen(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("rejecting ctdb_control_pull_db when not frozen\n")); + return -1; + } + + if (ctdb_db->unhealthy_reason) { + /* this is just a warning, as the tdb should be empty anyway */ + DEBUG(DEBUG_WARNING, + ("db(%s) unhealty in ctdb_control_db_pull: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + } + + state.ctdb = ctdb; + state.ctdb_db = ctdb_db; + state.recs = NULL; + state.pnn = c->hdr.srcnode; + state.srvid = pulldb_ext->srvid; + state.num_records = 0; + + /* If the records are invalid, we are done */ + if (ctdb_db->invalid_records) { + goto done; + } + + if (ctdb_lockdb_mark(ctdb_db) != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Failed to get lock on entire db - failing\n")); + return -1; + } + + ret = tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_db_pull, &state); + if (ret == -1) { + DEBUG(DEBUG_ERR, + (__location__ " Failed to get traverse db '%s'\n", + ctdb_db->db_name)); + ctdb_lockdb_unmark(ctdb_db); + return -1; + } + + /* Last few records */ + if (state.recs != NULL) { + TDB_DATA buffer; + + buffer = ctdb_marshall_finish(state.recs); + ret = ctdb_daemon_send_message(state.ctdb, state.pnn, + state.srvid, buffer); + if (ret != 0) { + TALLOC_FREE(state.recs); + ctdb_lockdb_unmark(ctdb_db); + return -1; + } + + state.num_records += state.recs->count; + TALLOC_FREE(state.recs); + } + + ctdb_lockdb_unmark(ctdb_db); + +done: + outdata->dptr = talloc_size(outdata, sizeof(uint32_t)); + if (outdata->dptr == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n")); + return -1; + } + + memcpy(outdata->dptr, (uint8_t *)&state.num_records, sizeof(uint32_t)); + outdata->dsize = sizeof(uint32_t); + + return 0; +} + +struct db_push_state { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + uint64_t srvid; + uint32_t num_records; + bool failed; +}; + +static void 
db_push_msg_handler(uint64_t srvid, TDB_DATA indata, + void *private_data) +{ + struct db_push_state *state = talloc_get_type( + private_data, struct db_push_state); + struct ctdb_marshall_buffer *recs; + struct ctdb_rec_data_old *rec; + unsigned int i; + int ret; + + if (state->failed) { + return; + } + + recs = (struct ctdb_marshall_buffer *)indata.dptr; + rec = (struct ctdb_rec_data_old *)&recs->data[0]; + + DEBUG(DEBUG_INFO, ("starting push of %u records for dbid 0x%x\n", + recs->count, recs->db_id)); + + for (i=0; i<recs->count; i++) { + TDB_DATA key, data; + struct ctdb_ltdb_header *hdr; + + key.dptr = &rec->data[0]; + key.dsize = rec->keylen; + data.dptr = &rec->data[key.dsize]; + data.dsize = rec->datalen; + + if (data.dsize < sizeof(struct ctdb_ltdb_header)) { + DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n")); + goto failed; + } + + hdr = (struct ctdb_ltdb_header *)data.dptr; + /* Strip off any read only record flags. + * All readonly records are revoked implicitly by a recovery. 
+ */ + hdr->flags &= ~CTDB_REC_RO_FLAGS; + + data.dptr += sizeof(*hdr); + data.dsize -= sizeof(*hdr); + + ret = ctdb_ltdb_store(state->ctdb_db, key, hdr, data); + if (ret != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Unable to store record\n")); + goto failed; + } + + rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec); + } + + DEBUG(DEBUG_DEBUG, ("finished push of %u records for dbid 0x%x\n", + recs->count, recs->db_id)); + + state->num_records += recs->count; + return; + +failed: + state->failed = true; +} + +int32_t ctdb_control_db_push_start(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_pulldb_ext *pulldb_ext; + struct ctdb_db_context *ctdb_db; + struct db_push_state *state; + int ret; + + pulldb_ext = (struct ctdb_pulldb_ext *)indata.dptr; + + ctdb_db = find_ctdb_db(ctdb, pulldb_ext->db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR, + (__location__ " Unknown db 0x%08x\n", pulldb_ext->db_id)); + return -1; + } + + if (!ctdb_db_frozen(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("rejecting ctdb_control_db_push_start when not frozen\n")); + return -1; + } + + if (ctdb_db->push_started) { + DEBUG(DEBUG_WARNING, + (__location__ " DB push already started for %s\n", + ctdb_db->db_name)); + + /* De-register old state */ + state = (struct db_push_state *)ctdb_db->push_state; + if (state != NULL) { + srvid_deregister(ctdb->srv, state->srvid, state); + talloc_free(state); + ctdb_db->push_state = NULL; + } + } + + state = talloc_zero(ctdb_db, struct db_push_state); + if (state == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n")); + return -1; + } + + state->ctdb = ctdb; + state->ctdb_db = ctdb_db; + state->srvid = pulldb_ext->srvid; + state->failed = false; + + ret = srvid_register(ctdb->srv, state, state->srvid, + db_push_msg_handler, state); + if (ret != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Failed to register srvid for db push\n")); + talloc_free(state); + return -1; + } + + if (ctdb_lockdb_mark(ctdb_db) != 0) { + 
DEBUG(DEBUG_ERR, + (__location__ " Failed to get lock on entire db - failing\n")); + srvid_deregister(ctdb->srv, state->srvid, state); + talloc_free(state); + return -1; + } + + ctdb_db->push_started = true; + ctdb_db->push_state = state; + + return 0; +} + +int32_t ctdb_control_db_push_confirm(struct ctdb_context *ctdb, + TDB_DATA indata, TDB_DATA *outdata) +{ + uint32_t db_id; + struct ctdb_db_context *ctdb_db; + struct db_push_state *state; + + db_id = *(uint32_t *)indata.dptr; + + ctdb_db = find_ctdb_db(ctdb, db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id)); + return -1; + } + + if (!ctdb_db_frozen(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("rejecting ctdb_control_db_push_confirm when not frozen\n")); + return -1; + } + + if (!ctdb_db->push_started) { + DEBUG(DEBUG_ERR, (__location__ " DB push not started\n")); + return -1; + } + + if (ctdb_db_readonly(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("Clearing the tracking database for dbid 0x%x\n", + ctdb_db->db_id)); + if (tdb_wipe_all(ctdb_db->rottdb) != 0) { + DEBUG(DEBUG_ERR, + ("Failed to wipe tracking database for 0x%x." 
+ " Dropping read-only delegation support\n", + ctdb_db->db_id)); + tdb_close(ctdb_db->rottdb); + ctdb_db->rottdb = NULL; + ctdb_db_reset_readonly(ctdb_db); + } + + while (ctdb_db->revokechild_active != NULL) { + talloc_free(ctdb_db->revokechild_active); + } + } + + ctdb_lockdb_unmark(ctdb_db); + + state = (struct db_push_state *)ctdb_db->push_state; + if (state == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Missing push db state\n")); + return -1; + } + + srvid_deregister(ctdb->srv, state->srvid, state); + + outdata->dptr = talloc_size(outdata, sizeof(uint32_t)); + if (outdata->dptr == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n")); + talloc_free(state); + ctdb_db->push_state = NULL; + return -1; + } + + memcpy(outdata->dptr, (uint8_t *)&state->num_records, sizeof(uint32_t)); + outdata->dsize = sizeof(uint32_t); + + talloc_free(state); + ctdb_db->push_started = false; + ctdb_db->push_state = NULL; + + return 0; +} + +struct set_recmode_state { + struct ctdb_context *ctdb; + struct ctdb_req_control_old *c; +}; + +static void set_recmode_handler(char status, + double latency, + void *private_data) +{ + struct set_recmode_state *state = talloc_get_type_abort( + private_data, struct set_recmode_state); + int s = 0; + const char *err = NULL; + + switch (status) { + case '0': + /* Mutex taken */ + DEBUG(DEBUG_ERR, + ("ERROR: Daemon able to take recovery lock on \"%s\" during recovery\n", + state->ctdb->recovery_lock)); + s = -1; + err = "Took recovery lock from daemon during recovery - probably a cluster filesystem lock coherence problem"; + break; + + case '1': + /* Contention */ + DEBUG(DEBUG_DEBUG, (__location__ " Recovery lock check OK\n")); + state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL; + ctdb_process_deferred_attach(state->ctdb); + + s = 0; + + CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock", + reclock.ctdbd, latency); + break; + + case '2': + /* Timeout. 
Consider this a success, not a failure, + * as we failed to set the recovery lock which is what + * we wanted. This can be caused by the cluster + * filesystem being very slow to arbitrate locks + * immediately after a node failure. */ + DEBUG(DEBUG_WARNING, + (__location__ + "Time out getting recovery lock, allowing recmode set anyway\n")); + state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL; + ctdb_process_deferred_attach(state->ctdb); + + s = 0; + break; + + default: + DEBUG(DEBUG_ERR, + ("Unexpected error when testing recovery lock\n")); + s = -1; + err = "Unexpected error when testing recovery lock"; + } + + ctdb_request_control_reply(state->ctdb, state->c, NULL, s, err); + talloc_free(state); +} + +static void +ctdb_drop_all_ips_event(struct tevent_context *ev, struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + + DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n")); + talloc_free(ctdb->release_ips_ctx); + ctdb->release_ips_ctx = NULL; + + ctdb_release_all_ips(ctdb); +} + +/* + * Set up an event to drop all public ips if we remain in recovery for too + * long + */ +int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb) +{ + if (ctdb->release_ips_ctx != NULL) { + talloc_free(ctdb->release_ips_ctx); + } + ctdb->release_ips_ctx = talloc_new(ctdb); + CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx); + + tevent_add_timer(ctdb->ev, ctdb->release_ips_ctx, + timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), + ctdb_drop_all_ips_event, ctdb); + return 0; +} + +/* + set the recovery mode + */ +int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, bool *async_reply, + const char **errormsg) +{ + uint32_t recmode = *(uint32_t *)indata.dptr; + struct ctdb_db_context *ctdb_db; + struct set_recmode_state *state; + struct ctdb_cluster_mutex_handle *h; + + if (recmode == 
ctdb->recovery_mode) { + D_INFO("Recovery mode already set to %s\n", + recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE"); + return 0; + } + + D_NOTICE("Recovery mode set to %s\n", + recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE"); + + /* if we enter recovery but stay in recovery for too long + we will eventually drop all our ip addresses + */ + if (recmode == CTDB_RECOVERY_ACTIVE) { + if (ctdb_deferred_drop_all_ips(ctdb) != 0) { + D_ERR("Failed to set up deferred drop all ips\n"); + } + + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + return 0; + } + + /* From this point: recmode == CTDB_RECOVERY_NORMAL + * + * Therefore, what follows is special handling when setting + * recovery mode back to normal */ + + TALLOC_FREE(ctdb->release_ips_ctx); + + for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) { + if (ctdb_db->generation != ctdb->vnn_map->generation) { + DEBUG(DEBUG_ERR, + ("Inconsistent DB generation %u for %s\n", + ctdb_db->generation, ctdb_db->db_name)); + DEBUG(DEBUG_ERR, ("Recovery mode set to ACTIVE\n")); + return -1; + } + } + + /* force the databases to thaw */ + if (ctdb_db_all_frozen(ctdb)) { + ctdb_control_thaw(ctdb, false); + } + + if (ctdb->recovery_lock == NULL) { + /* Not using recovery lock file */ + ctdb->recovery_mode = CTDB_RECOVERY_NORMAL; + ctdb_process_deferred_attach(ctdb); + return 0; + } + + state = talloc_zero(ctdb, struct set_recmode_state); + if (state == NULL) { + DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); + return -1; + } + state->ctdb = ctdb; + state->c = NULL; + + h = ctdb_cluster_mutex(state, ctdb, ctdb->recovery_lock, 5, + set_recmode_handler, state, NULL, NULL); + if (h == NULL) { + talloc_free(state); + return -1; + } + + state->c = talloc_steal(state, c); + *async_reply = true; + + return 0; +} + + +/* + delete a record as part of the vacuum process + only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn + use non-blocking locks + + return 0 if the record was 
successfully deleted (i.e. it does not exist + when the function returns) + or !0 is the record still exists in the tdb after returning. + */ +static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data_old *rec) +{ + TDB_DATA key, data, data2; + struct ctdb_ltdb_header *hdr, *hdr2; + + /* these are really internal tdb functions - but we need them here for + non-blocking lock of the freelist */ + int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype); + int tdb_unlock(struct tdb_context *tdb, int list, int ltype); + + + key.dsize = rec->keylen; + key.dptr = &rec->data[0]; + data.dsize = rec->datalen; + data.dptr = &rec->data[rec->keylen]; + + if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) { + DBG_INFO("Called delete on record where we are lmaster\n"); + return -1; + } + + if (data.dsize != sizeof(struct ctdb_ltdb_header)) { + DBG_ERR("Bad record size\n"); + return -1; + } + + hdr = (struct ctdb_ltdb_header *)data.dptr; + + /* use a non-blocking lock */ + if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) { + DBG_INFO("Failed to get non-blocking chain lock\n"); + return -1; + } + + data2 = tdb_fetch(ctdb_db->ltdb->tdb, key); + if (data2.dptr == NULL) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + return 0; + } + + if (data2.dsize < sizeof(struct ctdb_ltdb_header)) { + if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) { + if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) { + DBG_ERR("Failed to delete corrupt record\n"); + } + tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK); + DBG_ERR("Deleted corrupt record\n"); + } + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + free(data2.dptr); + return 0; + } + + hdr2 = (struct ctdb_ltdb_header *)data2.dptr; + + if (hdr2->rsn > hdr->rsn) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DBG_INFO("Skipping record with rsn=%llu - called with rsn=%llu\n", + (unsigned long long)hdr2->rsn, + (unsigned long long)hdr->rsn); + free(data2.dptr); + return -1; + } + + /* do 
not allow deleting record that have readonly flags set. */ + if (hdr->flags & CTDB_REC_RO_FLAGS) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DBG_INFO("Skipping record with readonly flags set\n"); + free(data2.dptr); + return -1; + } + if (hdr2->flags & CTDB_REC_RO_FLAGS) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DBG_INFO("Skipping record with readonly flags set locally\n"); + free(data2.dptr); + return -1; + } + + if (hdr2->dmaster == ctdb->pnn) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DBG_INFO("Attempted delete record where we are the dmaster\n"); + free(data2.dptr); + return -1; + } + + if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DBG_INFO("Failed to get non-blocking freelist lock\n"); + free(data2.dptr); + return -1; + } + + if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) { + tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK); + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DBG_INFO("Failed to delete record\n"); + free(data2.dptr); + return -1; + } + + tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK); + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + free(data2.dptr); + return 0; +} + + + +struct recovery_callback_state { + struct ctdb_req_control_old *c; +}; + + +/* + called when the 'recovered' event script has finished + */ +static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p) +{ + struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state); + + CTDB_INCREMENT_STAT(ctdb, num_recoveries); + + if (status != 0) { + DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status)); + if (status == -ETIMEDOUT) { + ctdb_ban_self(ctdb); + } + } + + ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); + talloc_free(state); + + gettimeofday(&ctdb->last_recovery_finished, NULL); + + if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) { + ctdb_set_runstate(ctdb, CTDB_RUNSTATE_STARTUP); + } +} + +/* + recovery has 
finished + */ +int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + bool *async_reply) +{ + int ret; + struct recovery_callback_state *state; + + DEBUG(DEBUG_ERR,("Recovery has finished\n")); + + ctdb_persistent_finish_trans3_commits(ctdb); + + state = talloc(ctdb, struct recovery_callback_state); + CTDB_NO_MEMORY(ctdb, state); + + state->c = c; + + ret = ctdb_event_script_callback(ctdb, state, + ctdb_end_recovery_callback, + state, + CTDB_EVENT_RECOVERED, "%s", ""); + + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n")); + talloc_free(state); + return -1; + } + + /* tell the control that we will be reply asynchronously */ + state->c = talloc_steal(state, c); + *async_reply = true; + return 0; +} + +/* + called when the 'startrecovery' event script has finished + */ +static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p) +{ + struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state); + + if (status != 0) { + DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status)); + } + + ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); + talloc_free(state); +} + +static void run_start_recovery_event(struct ctdb_context *ctdb, + struct recovery_callback_state *state) +{ + int ret; + + ret = ctdb_event_script_callback(ctdb, state, + ctdb_start_recovery_callback, + state, + CTDB_EVENT_START_RECOVERY, + "%s", ""); + + if (ret != 0) { + DEBUG(DEBUG_ERR,("Unable to run startrecovery event\n")); + ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL); + talloc_free(state); + return; + } + + return; +} + +static bool reclock_strings_equal(const char *a, const char *b) +{ + return (a == NULL && b == NULL) || + (a != NULL && b != NULL && strcmp(a, b) == 0); +} + +static void start_recovery_reclock_callback(struct ctdb_context *ctdb, + int32_t status, + TDB_DATA data, + const char *errormsg, + void 
*private_data) +{ + struct recovery_callback_state *state = talloc_get_type_abort( + private_data, struct recovery_callback_state); + const char *local = ctdb->recovery_lock; + const char *remote = NULL; + + if (status != 0) { + DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n")); + ctdb_request_control_reply(ctdb, state->c, NULL, + status, errormsg); + talloc_free(state); + return; + } + + /* Check reclock consistency */ + if (data.dsize > 0) { + /* Ensure NUL-termination */ + data.dptr[data.dsize-1] = '\0'; + remote = (const char *)data.dptr; + } + if (! reclock_strings_equal(local, remote)) { + /* Inconsistent */ + ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL); + DEBUG(DEBUG_ERR, + ("Recovery lock configuration inconsistent: " + "recmaster has %s, this node has %s, shutting down\n", + remote == NULL ? "NULL" : remote, + local == NULL ? "NULL" : local)); + talloc_free(state); + ctdb_shutdown_sequence(ctdb, 1); + } + DEBUG(DEBUG_INFO, + ("Recovery lock consistency check successful\n")); + + run_start_recovery_event(ctdb, state); +} + +/* Check recovery lock consistency and run eventscripts for the + * "startrecovery" event */ +int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + bool *async_reply) +{ + int ret; + struct recovery_callback_state *state; + uint32_t recmaster = c->hdr.srcnode; + + DEBUG(DEBUG_ERR, ("Recovery has started\n")); + gettimeofday(&ctdb->last_recovery_started, NULL); + + state = talloc(ctdb, struct recovery_callback_state); + CTDB_NO_MEMORY(ctdb, state); + + state->c = c; + + /* Although the recovery master sent this node a start + * recovery control, this node might still think the recovery + * master is disconnected. In this case defer the recovery + * lock consistency check. 
*/ + if (ctdb->nodes[recmaster]->flags & NODE_FLAGS_DISCONNECTED) { + run_start_recovery_event(ctdb, state); + } else { + /* Ask the recovery master about its reclock setting */ + ret = ctdb_daemon_send_control(ctdb, + recmaster, + 0, + CTDB_CONTROL_GET_RECLOCK_FILE, + 0, 0, + tdb_null, + start_recovery_reclock_callback, + state); + + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n")); + talloc_free(state); + return -1; + } + } + + /* tell the control that we will be reply asynchronously */ + state->c = talloc_steal(state, c); + *async_reply = true; + + return 0; +} + +/* + try to delete all these records as part of the vacuuming process + and return the records we failed to delete +*/ +int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata) +{ + struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr; + struct ctdb_db_context *ctdb_db; + unsigned int i; + struct ctdb_rec_data_old *rec; + struct ctdb_marshall_buffer *records; + + if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) { + DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n")); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, reply->db_id); + if (!ctdb_db) { + DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id)); + return -1; + } + + + DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n", + reply->count, reply->db_id)); + + + /* create a blob to send back the records we couldn't delete */ + records = (struct ctdb_marshall_buffer *) + talloc_zero_size(outdata, + offsetof(struct ctdb_marshall_buffer, data)); + if (records == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + return -1; + } + records->db_id = ctdb_db->db_id; + + + rec = (struct ctdb_rec_data_old *)&reply->data[0]; + for (i=0;i<reply->count;i++) { + TDB_DATA key, data; + + key.dptr = &rec->data[0]; + key.dsize = rec->keylen; + data.dptr = &rec->data[key.dsize]; + 
data.dsize = rec->datalen; + + if (data.dsize < sizeof(struct ctdb_ltdb_header)) { + DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n")); + talloc_free(records); + return -1; + } + + /* If we can't delete the record we must add it to the reply + so the lmaster knows it may not purge this record + */ + if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) { + size_t old_size; + struct ctdb_ltdb_header *hdr; + + hdr = (struct ctdb_ltdb_header *)data.dptr; + data.dptr += sizeof(*hdr); + data.dsize -= sizeof(*hdr); + + DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key))); + + old_size = talloc_get_size(records); + records = talloc_realloc_size(outdata, records, old_size + rec->length); + if (records == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n")); + return -1; + } + records->count++; + memcpy(old_size+(uint8_t *)records, rec, rec->length); + } + + rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec); + } + + + *outdata = ctdb_marshall_finish(records); + + return 0; +} + +/* + report capabilities + */ +int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata) +{ + uint32_t *capabilities = NULL; + + capabilities = talloc(outdata, uint32_t); + CTDB_NO_MEMORY(ctdb, capabilities); + *capabilities = ctdb->capabilities; + + outdata->dsize = sizeof(uint32_t); + outdata->dptr = (uint8_t *)capabilities; + + return 0; +} + +/* The recovery daemon will ping us at regular intervals. + If we haven't been pinged for a while we assume the recovery + daemon is inoperable and we restart. +*/ +static void ctdb_recd_ping_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *p) +{ + struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context); + uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t); + + DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. 
Count : %u\n", *count)); + + if (*count < ctdb->tunable.recd_ping_failcount) { + (*count)++; + tevent_add_timer(ctdb->ev, ctdb->recd_ping_count, + timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0), + ctdb_recd_ping_timeout, ctdb); + return; + } + + DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n")); + + ctdb_stop_recoverd(ctdb); + ctdb_start_recoverd(ctdb); +} + +int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb) +{ + talloc_free(ctdb->recd_ping_count); + + ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t); + CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count); + + if (ctdb->tunable.recd_ping_timeout != 0) { + tevent_add_timer(ctdb->ev, ctdb->recd_ping_count, + timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0), + ctdb_recd_ping_timeout, ctdb); + } + + return 0; +} + +void ctdb_node_become_inactive(struct ctdb_context *ctdb) +{ + struct ctdb_db_context *ctdb_db; + + D_WARNING("Making node INACTIVE\n"); + + /* + * Do not service database calls - reset generation to invalid + * so this node ignores any REQ/REPLY CALL/DMASTER + */ + ctdb->vnn_map->generation = INVALID_GENERATION; + for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) { + ctdb_db->generation = INVALID_GENERATION; + } + + /* + * Although this bypasses the control, the only thing missing + * is the deferred drop of all public IPs, which isn't + * necessary because they are dropped below + */ + if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { + D_NOTICE("Recovery mode set to ACTIVE\n"); + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + } + + /* + * Initiate database freeze - this will be scheduled for + * immediate execution and will be in progress long before the + * calling control returns + */ + ctdb_daemon_send_control(ctdb, + ctdb->pnn, + 0, + CTDB_CONTROL_FREEZE, + 0, + CTDB_CTRL_FLAG_NOREPLY, + tdb_null, + NULL, + NULL); + + D_NOTICE("Dropping all public IP addresses\n"); 
+ ctdb_release_all_ips(ctdb); +} + +int32_t ctdb_control_stop_node(struct ctdb_context *ctdb) +{ + DEBUG(DEBUG_ERR, ("Stopping node\n")); + ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED; + + ctdb_node_become_inactive(ctdb); + + return 0; +} + +int32_t ctdb_control_continue_node(struct ctdb_context *ctdb) +{ + DEBUG(DEBUG_ERR, ("Continue node\n")); + ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED; + + return 0; +} + diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c new file mode 100644 index 0000000..84e2081 --- /dev/null +++ b/ctdb/server/ctdb_recoverd.c @@ -0,0 +1,3286 @@ +/* + ctdb recovery daemon + + Copyright (C) Ronnie Sahlberg 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/time.h" +#include "system/network.h" +#include "system/wait.h" + +#include <popt.h> +#include <talloc.h> +#include <tevent.h> +#include <tdb.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/util_process.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "protocol/protocol_basic.h" + +#include "common/system_socket.h" +#include "common/common.h" +#include "common/logging.h" + +#include "server/ctdb_config.h" + +#include "ctdb_cluster_mutex.h" + +/* List of SRVID requests that need to be processed */ +struct srvid_list { + struct srvid_list *next, *prev; + struct ctdb_srvid_message *request; +}; + +struct srvid_requests { + struct srvid_list *requests; +}; + +static void srvid_request_reply(struct ctdb_context *ctdb, + struct ctdb_srvid_message *request, + TDB_DATA result) +{ + /* Someone that sent srvid==0 does not want a reply */ + if (request->srvid == 0) { + talloc_free(request); + return; + } + + if (ctdb_client_send_message(ctdb, request->pnn, request->srvid, + result) == 0) { + DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n", + (unsigned)request->pnn, + (unsigned long long)request->srvid)); + } else { + DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n", + (unsigned)request->pnn, + (unsigned long long)request->srvid)); + } + + talloc_free(request); +} + +static void srvid_requests_reply(struct ctdb_context *ctdb, + struct srvid_requests **requests, + TDB_DATA result) +{ + struct srvid_list *r; + + if (*requests == NULL) { + return; + } + + for (r = (*requests)->requests; r != NULL; r = r->next) { + srvid_request_reply(ctdb, r->request, result); + } + + /* Free the list structure... 
*/ + TALLOC_FREE(*requests); +} + +static void srvid_request_add(struct ctdb_context *ctdb, + struct srvid_requests **requests, + struct ctdb_srvid_message *request) +{ + struct srvid_list *t; + int32_t ret; + TDB_DATA result; + + if (*requests == NULL) { + *requests = talloc_zero(ctdb, struct srvid_requests); + if (*requests == NULL) { + goto nomem; + } + } + + t = talloc_zero(*requests, struct srvid_list); + if (t == NULL) { + /* If *requests was just allocated above then free it */ + if ((*requests)->requests == NULL) { + TALLOC_FREE(*requests); + } + goto nomem; + } + + t->request = (struct ctdb_srvid_message *)talloc_steal(t, request); + DLIST_ADD((*requests)->requests, t); + + return; + +nomem: + /* Failed to add the request to the list. Send a fail. */ + DEBUG(DEBUG_ERR, (__location__ + " Out of memory, failed to queue SRVID request\n")); + ret = -ENOMEM; + result.dsize = sizeof(ret); + result.dptr = (uint8_t *)&ret; + srvid_request_reply(ctdb, request, result); +} + +/* An abstraction to allow an operation (takeover runs, recoveries, + * ...) 
to be disabled for a given timeout */ +struct ctdb_op_state { + struct tevent_timer *timer; + bool in_progress; + const char *name; +}; + +static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name) +{ + struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state); + + if (state != NULL) { + state->in_progress = false; + state->name = name; + } + + return state; +} + +static bool ctdb_op_is_disabled(struct ctdb_op_state *state) +{ + return state->timer != NULL; +} + +static bool ctdb_op_begin(struct ctdb_op_state *state) +{ + if (ctdb_op_is_disabled(state)) { + DEBUG(DEBUG_NOTICE, + ("Unable to begin - %s are disabled\n", state->name)); + return false; + } + + state->in_progress = true; + return true; +} + +static bool ctdb_op_end(struct ctdb_op_state *state) +{ + return state->in_progress = false; +} + +static bool ctdb_op_is_in_progress(struct ctdb_op_state *state) +{ + return state->in_progress; +} + +static void ctdb_op_enable(struct ctdb_op_state *state) +{ + TALLOC_FREE(state->timer); +} + +static void ctdb_op_timeout_handler(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval yt, void *p) +{ + struct ctdb_op_state *state = + talloc_get_type(p, struct ctdb_op_state); + + DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name)); + ctdb_op_enable(state); +} + +static int ctdb_op_disable(struct ctdb_op_state *state, + struct tevent_context *ev, + uint32_t timeout) +{ + if (timeout == 0) { + DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name)); + ctdb_op_enable(state); + return 0; + } + + if (state->in_progress) { + DEBUG(DEBUG_ERR, + ("Unable to disable %s - in progress\n", state->name)); + return -EAGAIN; + } + + DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n", + state->name, timeout)); + + /* Clear any old timers */ + talloc_free(state->timer); + + /* Arrange for the timeout to occur */ + state->timer = tevent_add_timer(ev, state, + timeval_current_ofs(timeout, 0), + ctdb_op_timeout_handler, 
state); + if (state->timer == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n")); + return -ENOMEM; + } + + return 0; +} + +struct ctdb_banning_state { + uint32_t pnn; + uint32_t count; + struct timeval last_reported_time; +}; + +struct ctdb_cluster_lock_handle; + +/* + private state of recovery daemon + */ +struct ctdb_recoverd { + struct ctdb_context *ctdb; + uint32_t leader; + struct tevent_timer *leader_broadcast_te; + struct tevent_timer *leader_broadcast_timeout_te; + uint32_t pnn; + uint32_t last_culprit_node; + struct ctdb_banning_state *banning_state; + struct ctdb_node_map_old *nodemap; + struct timeval priority_time; + bool need_takeover_run; + bool need_recovery; + uint32_t node_flags; + struct tevent_timer *send_election_te; + bool election_in_progress; + struct tevent_timer *election_timeout; + struct srvid_requests *reallocate_requests; + struct ctdb_op_state *takeover_run; + struct ctdb_op_state *recovery; + struct ctdb_iface_list_old *ifaces; + uint32_t *force_rebalance_nodes; + struct ctdb_node_capabilities *caps; + bool frozen_on_inactive; + struct ctdb_cluster_lock_handle *cluster_lock_handle; + pid_t helper_pid; +}; + +#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0) +#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0) + +static void ctdb_restart_recd(struct tevent_context *ev, + struct tevent_timer *te, struct timeval t, + void *private_data); + +static bool this_node_is_leader(struct ctdb_recoverd *rec) +{ + return rec->leader == rec->pnn; +} + +static bool this_node_can_be_leader(struct ctdb_recoverd *rec) +{ + return (rec->node_flags & NODE_FLAGS_INACTIVE) == 0 && + (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) != 0; +} + +static bool node_flags(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t *flags) +{ + size_t i; + + for (i = 0; i < rec->nodemap->num; i++) { + struct ctdb_node_and_flags *node = &rec->nodemap->nodes[i]; + if (node->pnn == pnn) { + if (flags != 
NULL) { + *flags = node->flags; + } + return true; + } + } + + return false; +} + +/* + ban a node for a period of time + */ +static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn) +{ + int ret; + struct ctdb_context *ctdb = rec->ctdb; + uint32_t ban_time = ctdb->tunable.recovery_ban_period; + struct ctdb_ban_state bantime; + + if (!ctdb_validate_pnn(ctdb, pnn)) { + DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn)); + return; + } + + DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time)); + + bantime.pnn = pnn; + bantime.time = ban_time; + + ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn)); + return; + } + +} + +enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED}; + + +/* + remember the trouble maker + */ +static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, + uint32_t culprit, + uint32_t count) +{ + struct ctdb_context *ctdb = talloc_get_type_abort( + rec->ctdb, struct ctdb_context); + struct ctdb_banning_state *ban_state = NULL; + size_t len; + bool ok; + + ok = node_flags(rec, culprit, NULL); + if (!ok) { + DBG_WARNING("Unknown culprit node %"PRIu32"\n", culprit); + return; + } + + /* If we are banned or stopped, do not set other nodes as culprits */ + if (rec->node_flags & NODE_FLAGS_INACTIVE) { + D_WARNING("This node is INACTIVE, cannot set culprit node %d\n", + culprit); + return; + } + + if (rec->banning_state == NULL) { + len = 0; + } else { + size_t i; + + len = talloc_array_length(rec->banning_state); + + for (i = 0 ; i < len; i++) { + if (rec->banning_state[i].pnn == culprit) { + ban_state= &rec->banning_state[i]; + break; + } + } + } + + /* Not found, so extend (or allocate new) array */ + if (ban_state == NULL) { + struct ctdb_banning_state *t; + + len += 1; + /* + * talloc_realloc() handles the corner case where + * rec->banning_state is NULL + */ + t = 
talloc_realloc(rec, + rec->banning_state, + struct ctdb_banning_state, + len); + if (t == NULL) { + DBG_WARNING("Memory allocation error\n"); + return; + } + rec->banning_state = t; + + /* New element is always at the end - initialise it... */ + ban_state = &rec->banning_state[len - 1]; + *ban_state = (struct ctdb_banning_state) { + .pnn = culprit, + .count = 0, + }; + } else if (ban_state->count > 0 && + timeval_elapsed(&ban_state->last_reported_time) > + ctdb->tunable.recovery_grace_period) { + /* + * Forgive old transgressions beyond the tunable time-limit + */ + ban_state->count = 0; + } + + ban_state->count += count; + ban_state->last_reported_time = timeval_current(); + rec->last_culprit_node = culprit; +} + +static void ban_counts_reset(struct ctdb_recoverd *rec) +{ + D_NOTICE("Resetting ban count to 0 for all nodes\n"); + TALLOC_FREE(rec->banning_state); +} + +/* + remember the trouble maker + */ +static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit) +{ + ctdb_set_culprit_count(rec, culprit, 1); +} + +/* + Retrieve capabilities from all connected nodes + */ +static int update_capabilities(struct ctdb_recoverd *rec, + struct ctdb_node_map_old *nodemap) +{ + uint32_t *capp; + TALLOC_CTX *tmp_ctx; + struct ctdb_node_capabilities *caps; + struct ctdb_context *ctdb = rec->ctdb; + + tmp_ctx = talloc_new(rec); + CTDB_NO_MEMORY(ctdb, tmp_ctx); + + caps = ctdb_get_capabilities(ctdb, tmp_ctx, + CONTROL_TIMEOUT(), nodemap); + + if (caps == NULL) { + DEBUG(DEBUG_ERR, + (__location__ " Failed to get node capabilities\n")); + talloc_free(tmp_ctx); + return -1; + } + + capp = ctdb_get_node_capabilities(caps, rec->pnn); + if (capp == NULL) { + DEBUG(DEBUG_ERR, + (__location__ + " Capabilities don't include current node.\n")); + talloc_free(tmp_ctx); + return -1; + } + ctdb->capabilities = *capp; + + TALLOC_FREE(rec->caps); + rec->caps = talloc_steal(rec, caps); + + talloc_free(tmp_ctx); + return 0; +} + +/* + change recovery mode on all nodes + */ 
+static int set_recovery_mode(struct ctdb_context *ctdb, + struct ctdb_recoverd *rec, + struct ctdb_node_map_old *nodemap, + uint32_t rec_mode) +{ + TDB_DATA data; + uint32_t *nodes; + TALLOC_CTX *tmp_ctx; + + tmp_ctx = talloc_new(ctdb); + CTDB_NO_MEMORY(ctdb, tmp_ctx); + + nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true); + + data.dsize = sizeof(uint32_t); + data.dptr = (unsigned char *)&rec_mode; + + if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE, + nodes, 0, + CONTROL_TIMEOUT(), + false, data, + NULL, NULL, + NULL) != 0) { + DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n")); + talloc_free(tmp_ctx); + return -1; + } + + talloc_free(tmp_ctx); + return 0; +} + +/* + * Update flags on all connected nodes + */ +static int update_flags_on_all_nodes(struct ctdb_recoverd *rec, + uint32_t pnn, + uint32_t flags) +{ + struct ctdb_context *ctdb = rec->ctdb; + struct timeval timeout = CONTROL_TIMEOUT(); + TDB_DATA data; + struct ctdb_node_map_old *nodemap=NULL; + struct ctdb_node_flag_change c; + TALLOC_CTX *tmp_ctx = talloc_new(ctdb); + uint32_t *nodes; + uint32_t i; + int ret; + + nodemap = rec->nodemap; + + for (i = 0; i < nodemap->num; i++) { + if (pnn == nodemap->nodes[i].pnn) { + break; + } + } + if (i >= nodemap->num) { + DBG_ERR("Nodemap does not contain node %d\n", pnn); + talloc_free(tmp_ctx); + return -1; + } + + c.pnn = pnn; + c.old_flags = nodemap->nodes[i].flags; + c.new_flags = flags; + + data.dsize = sizeof(c); + data.dptr = (unsigned char *)&c; + + /* send the flags update to all connected nodes */ + nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true); + + ret = ctdb_client_async_control(ctdb, + CTDB_CONTROL_MODIFY_FLAGS, + nodes, + 0, + timeout, + false, + data, + NULL, + NULL, + NULL); + if (ret != 0) { + DBG_ERR("Unable to update flags on remote nodes\n"); + talloc_free(tmp_ctx); + return -1; + } + + talloc_free(tmp_ctx); + return 0; +} + +static bool _cluster_lock_lock(struct 
ctdb_recoverd *rec); +static bool cluster_lock_held(struct ctdb_recoverd *rec); + +static bool cluster_lock_enabled(struct ctdb_recoverd *rec) +{ + return rec->ctdb->recovery_lock != NULL; +} + +static bool cluster_lock_take(struct ctdb_recoverd *rec) +{ + struct ctdb_context *ctdb = rec->ctdb; + bool have_lock; + + if (!cluster_lock_enabled(rec)) { + return true; + } + + if (cluster_lock_held(rec)) { + D_NOTICE("Already holding cluster lock\n"); + return true; + } + + D_NOTICE("Attempting to take cluster lock (%s)\n", ctdb->recovery_lock); + have_lock = _cluster_lock_lock(rec); + if (!have_lock) { + return false; + } + + D_NOTICE("Cluster lock taken successfully\n"); + return true; +} + +/* + called when ctdb_wait_timeout should finish + */ +static void ctdb_wait_handler(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval yt, void *p) +{ + uint32_t *timed_out = (uint32_t *)p; + (*timed_out) = 1; +} + +/* + wait for a given number of seconds + */ +static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs) +{ + uint32_t timed_out = 0; + uint32_t usecs = (secs - (uint32_t)secs) * 1000000; + tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), + ctdb_wait_handler, &timed_out); + while (!timed_out) { + tevent_loop_once(ctdb->ev); + } +} + +/* + * Broadcast cluster leader + */ + +static int leader_broadcast_send(struct ctdb_recoverd *rec, uint32_t pnn) +{ + struct ctdb_context *ctdb = rec->ctdb; + TDB_DATA data; + int ret; + + data.dptr = (uint8_t *)&pnn; + data.dsize = sizeof(pnn); + + ret = ctdb_client_send_message(ctdb, + CTDB_BROADCAST_CONNECTED, + CTDB_SRVID_LEADER, + data); + return ret; +} + +static int leader_broadcast_loop(struct ctdb_recoverd *rec); +static void cluster_lock_release(struct ctdb_recoverd *rec); + +/* This runs continuously but only sends the broadcast when leader */ +static void leader_broadcast_loop_handler(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval current_time, + void 
*private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type_abort( + private_data, struct ctdb_recoverd); + int ret; + + if (!this_node_can_be_leader(rec)) { + if (this_node_is_leader(rec)) { + rec->leader = CTDB_UNKNOWN_PNN; + } + if (cluster_lock_enabled(rec) && cluster_lock_held(rec)) { + cluster_lock_release(rec); + } + goto done; + } + + if (!this_node_is_leader(rec)) { + goto done; + } + + if (rec->election_in_progress) { + goto done; + } + + ret = leader_broadcast_send(rec, rec->leader); + if (ret != 0) { + DBG_WARNING("Failed to send leader broadcast\n"); + } + +done: + ret = leader_broadcast_loop(rec); + if (ret != 0) { + D_WARNING("Failed to set up leader broadcast\n"); + } +} + +static int leader_broadcast_loop(struct ctdb_recoverd *rec) +{ + struct ctdb_context *ctdb = rec->ctdb; + + TALLOC_FREE(rec->leader_broadcast_te); + rec->leader_broadcast_te = + tevent_add_timer(ctdb->ev, + rec, + timeval_current_ofs(1, 0), + leader_broadcast_loop_handler, + rec); + if (rec->leader_broadcast_te == NULL) { + return ENOMEM; + } + + return 0; +} + +static bool leader_broadcast_loop_active(struct ctdb_recoverd *rec) +{ + return rec->leader_broadcast_te != NULL; +} + +/* + called when an election times out (ends) + */ +static void ctdb_election_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *p) +{ + struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd); + bool ok; + + rec->election_in_progress = false; + rec->election_timeout = NULL; + fast_start = false; + + D_WARNING("Election period ended, leader=%u\n", rec->leader); + + if (!this_node_is_leader(rec)) { + return; + } + + ok = cluster_lock_take(rec); + if (!ok) { + D_ERR("Unable to get cluster lock, banning node\n"); + ctdb_ban_node(rec, rec->pnn); + } +} + + +/* + wait for an election to finish. 
It finished election_timeout seconds after + the last election packet is received + */ +static void ctdb_wait_election(struct ctdb_recoverd *rec) +{ + struct ctdb_context *ctdb = rec->ctdb; + while (rec->election_in_progress) { + tevent_loop_once(ctdb->ev); + } +} + +/* + * Update local flags from all remote connected nodes and push out + * flags changes to all nodes. This is only run by the leader. + */ +static int update_flags(struct ctdb_recoverd *rec, + struct ctdb_node_map_old *nodemap, + struct ctdb_node_map_old **remote_nodemaps) +{ + unsigned int j; + struct ctdb_context *ctdb = rec->ctdb; + TALLOC_CTX *mem_ctx = talloc_new(ctdb); + + /* Check flags from remote nodes */ + for (j=0; j<nodemap->num; j++) { + struct ctdb_node_map_old *remote_nodemap=NULL; + uint32_t local_flags = nodemap->nodes[j].flags; + uint32_t remote_pnn = nodemap->nodes[j].pnn; + uint32_t remote_flags; + unsigned int i; + int ret; + + if (local_flags & NODE_FLAGS_DISCONNECTED) { + continue; + } + if (remote_pnn == rec->pnn) { + /* + * No remote nodemap for this node since this + * is the local nodemap. However, still need + * to check this against the remote nodes and + * push it if they are out-of-date. + */ + goto compare_remotes; + } + + remote_nodemap = remote_nodemaps[j]; + remote_flags = remote_nodemap->nodes[j].flags; + + if (local_flags != remote_flags) { + /* + * Update the local copy of the flags in the + * recovery daemon. 
+ */ + D_NOTICE("Remote node %u had flags 0x%x, " + "local had 0x%x - updating local\n", + remote_pnn, + remote_flags, + local_flags); + nodemap->nodes[j].flags = remote_flags; + local_flags = remote_flags; + goto push; + } + +compare_remotes: + for (i = 0; i < nodemap->num; i++) { + if (i == j) { + continue; + } + if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) { + continue; + } + if (nodemap->nodes[i].pnn == rec->pnn) { + continue; + } + + remote_nodemap = remote_nodemaps[i]; + remote_flags = remote_nodemap->nodes[j].flags; + + if (local_flags != remote_flags) { + goto push; + } + } + + continue; + +push: + D_NOTICE("Pushing updated flags for node %u (0x%x)\n", + remote_pnn, + local_flags); + ret = update_flags_on_all_nodes(rec, remote_pnn, local_flags); + if (ret != 0) { + DBG_ERR("Unable to update flags on remote nodes\n"); + talloc_free(mem_ctx); + return -1; + } + } + talloc_free(mem_ctx); + return 0; +} + + +/* Create a new random generation id. + The generation id can not be the INVALID_GENERATION id +*/ +static uint32_t new_generation(void) +{ + uint32_t generation; + + while (1) { + generation = random(); + + if (generation != INVALID_GENERATION) { + break; + } + } + + return generation; +} + +static bool cluster_lock_held(struct ctdb_recoverd *rec) +{ + return (rec->cluster_lock_handle != NULL); +} + +struct ctdb_cluster_lock_handle { + bool done; + bool locked; + double latency; + struct ctdb_cluster_mutex_handle *h; + struct ctdb_recoverd *rec; +}; + +static void take_cluster_lock_handler(char status, + double latency, + void *private_data) +{ + struct ctdb_cluster_lock_handle *s = + (struct ctdb_cluster_lock_handle *) private_data; + + s->locked = (status == '0') ; + + /* + * If unsuccessful then ensure the process has exited and that + * the file descriptor event handler has been cancelled + */ + if (! 
s->locked) { + TALLOC_FREE(s->h); + } + + switch (status) { + case '0': + s->latency = latency; + break; + + case '1': + D_ERR("Unable to take cluster lock - contention\n"); + break; + + case '2': + D_ERR("Unable to take cluster lock - timeout\n"); + break; + + default: + D_ERR("Unable to take cluster lock - unknown error\n"); + } + + s->done = true; +} + +static void force_election(struct ctdb_recoverd *rec); + +static void lost_cluster_lock_handler(void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type_abort( + private_data, struct ctdb_recoverd); + + D_ERR("Cluster lock helper terminated\n"); + TALLOC_FREE(rec->cluster_lock_handle); + + if (this_node_can_be_leader(rec)) { + force_election(rec); + } +} + +static bool _cluster_lock_lock(struct ctdb_recoverd *rec) +{ + struct ctdb_context *ctdb = rec->ctdb; + struct ctdb_cluster_mutex_handle *h; + struct ctdb_cluster_lock_handle *s; + + s = talloc_zero(rec, struct ctdb_cluster_lock_handle); + if (s == NULL) { + DBG_ERR("Memory allocation error\n"); + return false; + }; + + s->rec = rec; + + h = ctdb_cluster_mutex(s, + ctdb, + ctdb->recovery_lock, + 120, + take_cluster_lock_handler, + s, + lost_cluster_lock_handler, + rec); + if (h == NULL) { + talloc_free(s); + return false; + } + + rec->cluster_lock_handle = s; + s->h = h; + + while (! s->done) { + tevent_loop_once(ctdb->ev); + } + + if (! s->locked) { + TALLOC_FREE(rec->cluster_lock_handle); + return false; + } + + ctdb_ctrl_report_recd_lock_latency(ctdb, + CONTROL_TIMEOUT(), + s->latency); + + return true; +} + +static void cluster_lock_release(struct ctdb_recoverd *rec) +{ + if (rec->cluster_lock_handle == NULL) { + return; + } + + if (! rec->cluster_lock_handle->done) { + /* + * Taking of cluster lock still in progress. Free + * the cluster mutex handle to release it but leave + * the cluster lock handle in place to allow taking + * of the lock to fail. 
+ */ + D_NOTICE("Cancelling cluster lock\n"); + TALLOC_FREE(rec->cluster_lock_handle->h); + rec->cluster_lock_handle->done = true; + rec->cluster_lock_handle->locked = false; + return; + } + + D_NOTICE("Releasing cluster lock\n"); + TALLOC_FREE(rec->cluster_lock_handle); +} + +static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban) +{ + size_t len = talloc_array_length(rec->banning_state); + size_t i; + + + *self_ban = false; + for (i = 0; i < len; i++) { + struct ctdb_banning_state *ban_state = &rec->banning_state[i]; + + if (ban_state->count < 2 * rec->nodemap->num) { + continue; + } + + D_NOTICE("Node %u reached %u banning credits\n", + ban_state->pnn, + ban_state->count); + ctdb_ban_node(rec, ban_state->pnn); + ban_state->count = 0; + + /* Banning ourself? */ + if (ban_state->pnn == rec->pnn) { + *self_ban = true; + } + } +} + +struct helper_state { + int fd[2]; + pid_t pid; + int result; + bool done; +}; + +static void helper_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + struct helper_state *state = talloc_get_type_abort( + private_data, struct helper_state); + int ret; + + ret = sys_read(state->fd[0], &state->result, sizeof(state->result)); + if (ret != sizeof(state->result)) { + state->result = EPIPE; + } + + state->done = true; +} + +static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, + const char *prog, const char *arg, const char *type) +{ + struct helper_state *state; + struct tevent_fd *fde; + const char **args; + int nargs, ret; + + state = talloc_zero(mem_ctx, struct helper_state); + if (state == NULL) { + DEBUG(DEBUG_ERR, (__location__ " memory error\n")); + return -1; + } + + state->pid = -1; + + ret = pipe(state->fd); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed to create pipe for %s helper\n", type)); + goto fail; + } + + set_close_on_exec(state->fd[0]); + + nargs = 4; + args = talloc_array(state, const char *, nargs); + if (args == NULL) { + 
DEBUG(DEBUG_ERR, (__location__ " memory error\n")); + goto fail; + } + + args[0] = talloc_asprintf(args, "%d", state->fd[1]); + if (args[0] == NULL) { + DEBUG(DEBUG_ERR, (__location__ " memory error\n")); + goto fail; + } + args[1] = rec->ctdb->daemon.name; + args[2] = arg; + args[3] = NULL; + + if (args[2] == NULL) { + nargs = 3; + } + + state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args); + if (state->pid == -1) { + DEBUG(DEBUG_ERR, + ("Failed to create child for %s helper\n", type)); + goto fail; + } + + close(state->fd[1]); + state->fd[1] = -1; + + rec->helper_pid = state->pid; + state->done = false; + + fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0], + TEVENT_FD_READ, helper_handler, state); + if (fde == NULL) { + goto fail; + } + tevent_fd_set_auto_close(fde); + + while (!state->done) { + tevent_loop_once(rec->ctdb->ev); + + if (!this_node_is_leader(rec)) { + D_ERR("Leader changed to %u, aborting %s\n", + rec->leader, + type); + state->result = 1; + break; + } + } + + close(state->fd[0]); + state->fd[0] = -1; + + if (state->result != 0) { + goto fail; + } + + rec->helper_pid = -1; + ctdb_kill(rec->ctdb, state->pid, SIGKILL); + talloc_free(state); + return 0; + +fail: + if (state->fd[0] != -1) { + close(state->fd[0]); + } + if (state->fd[1] != -1) { + close(state->fd[1]); + } + rec->helper_pid = -1; + if (state->pid != -1) { + ctdb_kill(rec->ctdb, state->pid, SIGKILL); + } + talloc_free(state); + return -1; +} + + +static int ctdb_takeover(struct ctdb_recoverd *rec, + uint32_t *force_rebalance_nodes) +{ + static char prog[PATH_MAX+1] = ""; + char *arg; + unsigned int i; + int ret; + + if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog), + "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR, + "ctdb_takeover_helper")) { + ctdb_die(rec->ctdb, "Unable to set takeover helper\n"); + } + + arg = NULL; + for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) { + uint32_t pnn = force_rebalance_nodes[i]; + if (arg == NULL) { + arg = 
talloc_asprintf(rec, "%u", pnn); + } else { + arg = talloc_asprintf_append(arg, ",%u", pnn); + } + if (arg == NULL) { + DEBUG(DEBUG_ERR, (__location__ " memory error\n")); + return -1; + } + } + + if (ctdb_config.failover_disabled) { + ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1); + if (ret != 0) { + D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n"); + return -1; + } + } + + return helper_run(rec, rec, prog, arg, "takeover"); +} + +static bool do_takeover_run(struct ctdb_recoverd *rec, + struct ctdb_node_map_old *nodemap) +{ + uint32_t *nodes = NULL; + struct ctdb_disable_message dtr; + TDB_DATA data; + size_t i; + uint32_t *rebalance_nodes = rec->force_rebalance_nodes; + int ret; + bool ok; + + DEBUG(DEBUG_NOTICE, ("Takeover run starting\n")); + + if (ctdb_op_is_in_progress(rec->takeover_run)) { + DEBUG(DEBUG_ERR, (__location__ + " takeover run already in progress \n")); + ok = false; + goto done; + } + + if (!ctdb_op_begin(rec->takeover_run)) { + ok = false; + goto done; + } + + /* Disable IP checks (takeover runs, really) on other nodes + * while doing this takeover run. This will stop those other + * nodes from triggering takeover runs when think they should + * be hosting an IP but it isn't yet on an interface. Don't + * wait for replies since a failure here might cause some + * noise in the logs but will not actually cause a problem. + */ + ZERO_STRUCT(dtr); + dtr.srvid = 0; /* No reply */ + dtr.pnn = -1; + + data.dptr = (uint8_t*)&dtr; + data.dsize = sizeof(dtr); + + nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false); + + /* Disable for 60 seconds. This can be a tunable later if + * necessary. 
+ */ + dtr.timeout = 60; + for (i = 0; i < talloc_array_length(nodes); i++) { + if (ctdb_client_send_message(rec->ctdb, nodes[i], + CTDB_SRVID_DISABLE_TAKEOVER_RUNS, + data) != 0) { + DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n")); + } + } + + ret = ctdb_takeover(rec, rec->force_rebalance_nodes); + + /* Re-enable takeover runs and IP checks on other nodes */ + dtr.timeout = 0; + for (i = 0; i < talloc_array_length(nodes); i++) { + if (ctdb_client_send_message(rec->ctdb, nodes[i], + CTDB_SRVID_DISABLE_TAKEOVER_RUNS, + data) != 0) { + DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n")); + } + } + + if (ret != 0) { + DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n")); + ok = false; + goto done; + } + + ok = true; + /* Takeover run was successful so clear force rebalance targets */ + if (rebalance_nodes == rec->force_rebalance_nodes) { + TALLOC_FREE(rec->force_rebalance_nodes); + } else { + DEBUG(DEBUG_WARNING, + ("Rebalance target nodes changed during takeover run - not clearing\n")); + } +done: + rec->need_takeover_run = !ok; + talloc_free(nodes); + ctdb_op_end(rec->takeover_run); + + DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? 
"completed successfully" : "unsuccessful")); + return ok; +} + +static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx) +{ + static char prog[PATH_MAX+1] = ""; + const char *arg; + + if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog), + "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR, + "ctdb_recovery_helper")) { + ctdb_die(rec->ctdb, "Unable to set recovery helper\n"); + } + + arg = talloc_asprintf(mem_ctx, "%u", new_generation()); + if (arg == NULL) { + DEBUG(DEBUG_ERR, (__location__ " memory error\n")); + return -1; + } + + setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1); + + return helper_run(rec, mem_ctx, prog, arg, "recovery"); +} + +/* + * Main recovery function, only run by leader + */ +static int do_recovery(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx) +{ + struct ctdb_context *ctdb = rec->ctdb; + struct ctdb_node_map_old *nodemap = rec->nodemap; + unsigned int i; + int ret; + bool self_ban; + + DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n")); + + /* Check if the current node is still the leader. It's possible that + * re-election has changed the leader. 
+ */ + if (!this_node_is_leader(rec)) { + D_NOTICE("Leader changed to %u, aborting recovery\n", + rec->leader); + return -1; + } + + /* if recovery fails, force it again */ + rec->need_recovery = true; + + if (!ctdb_op_begin(rec->recovery)) { + return -1; + } + + if (rec->election_in_progress) { + /* an election is in progress */ + DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n")); + goto fail; + } + + ban_misbehaving_nodes(rec, &self_ban); + if (self_ban) { + DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n")); + goto fail; + } + + if (cluster_lock_enabled(rec) && !cluster_lock_held(rec)) { + /* Leader can change in ban_misbehaving_nodes() */ + if (!this_node_is_leader(rec)) { + D_NOTICE("Leader changed to %u, aborting recovery\n", + rec->leader); + rec->need_recovery = false; + goto fail; + } + + D_ERR("Cluster lock not held - abort recovery, ban node\n"); + ctdb_ban_node(rec, rec->pnn); + goto fail; + } + + DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node)); + + /* Retrieve capabilities from all connected nodes */ + ret = update_capabilities(rec, nodemap); + if (ret!=0) { + DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n")); + return -1; + } + + /* + update all nodes to have the same flags that we have + */ + for (i=0;i<nodemap->num;i++) { + if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) { + continue; + } + + ret = update_flags_on_all_nodes(rec, + nodemap->nodes[i].pnn, + nodemap->nodes[i].flags); + if (ret != 0) { + if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) { + DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i)); + } else { + DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i)); + return -1; + } + } + } + + DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n")); + + ret = db_recovery_parallel(rec, mem_ctx); + if (ret != 0) { + 
goto fail; + } + + do_takeover_run(rec, nodemap); + + /* send a message to all clients telling them that the cluster + has been reconfigured */ + ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, + CTDB_SRVID_RECONFIGURE, tdb_null); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n")); + goto fail; + } + + DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n")); + + rec->need_recovery = false; + ctdb_op_end(rec->recovery); + + /* + * Completed a full recovery so forgive any past transgressions + */ + ban_counts_reset(rec); + + /* We just finished a recovery successfully. + We now wait for rerecovery_timeout before we allow + another recovery to take place. + */ + DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout)); + ctdb_op_disable(rec->recovery, ctdb->ev, + ctdb->tunable.rerecovery_timeout); + return 0; + +fail: + ctdb_op_end(rec->recovery); + return -1; +} + + +/* + elections are won by first checking the number of connected nodes, then + the priority time, then the pnn + */ +struct election_message { + uint32_t num_connected; + struct timeval priority_time; + uint32_t pnn; + uint32_t node_flags; +}; + +/* + form this nodes election data + */ +static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em) +{ + unsigned int i; + int ret; + struct ctdb_node_map_old *nodemap; + struct ctdb_context *ctdb = rec->ctdb; + bool ok; + + ZERO_STRUCTP(em); + + em->pnn = rec->pnn; + em->priority_time = rec->priority_time; + + ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n")); + return; + } + + ok = node_flags(rec, rec->pnn, &rec->node_flags); + if (!ok) { + DBG_ERR("Unable to get node flags for this node\n"); + return; + } + em->node_flags = rec->node_flags; + + for 
(i=0;i<nodemap->num;i++) { + if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) { + em->num_connected++; + } + } + + if (!this_node_can_be_leader(rec)) { + /* Try to lose... */ + em->num_connected = 0; + em->priority_time = timeval_current(); + } + + talloc_free(nodemap); +} + +/* + see if the given election data wins + */ +static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em) +{ + struct election_message myem; + int cmp = 0; + + ctdb_election_data(rec, &myem); + + if (!this_node_can_be_leader(rec)) { + return false; + } + + /* Automatically win if other node is banned or stopped */ + if (em->node_flags & NODE_FLAGS_INACTIVE) { + return true; + } + + /* then the longest running node */ + if (cmp == 0) { + cmp = timeval_compare(&em->priority_time, &myem.priority_time); + } + + if (cmp == 0) { + cmp = (int)myem.pnn - (int)em->pnn; + } + + return cmp > 0; +} + +/* + send out an election request + */ +static int send_election_request(struct ctdb_recoverd *rec) +{ + TDB_DATA election_data; + struct election_message emsg; + uint64_t srvid; + struct ctdb_context *ctdb = rec->ctdb; + + srvid = CTDB_SRVID_ELECTION; + + ctdb_election_data(rec, &emsg); + + election_data.dsize = sizeof(struct election_message); + election_data.dptr = (unsigned char *)&emsg; + + + /* Assume this node will win the election, set leader accordingly */ + rec->leader = rec->pnn; + + /* send an election message to all active nodes */ + DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n")); + return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data); +} + +/* + we think we are winning the election - send a broadcast election request + */ +static void election_send_request(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *p) +{ + struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd); + int ret; + + ret = send_election_request(rec); + if (ret != 0) { + 
DEBUG(DEBUG_ERR,("Failed to send election request!\n")); + } + + TALLOC_FREE(rec->send_election_te); +} + +/* + handler for memory dumps +*/ +static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + struct ctdb_context *ctdb = rec->ctdb; + TALLOC_CTX *tmp_ctx = talloc_new(ctdb); + TDB_DATA *dump; + int ret; + struct ctdb_srvid_message *rd; + + if (data.dsize != sizeof(struct ctdb_srvid_message)) { + DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n")); + talloc_free(tmp_ctx); + return; + } + rd = (struct ctdb_srvid_message *)data.dptr; + + dump = talloc_zero(tmp_ctx, TDB_DATA); + if (dump == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n")); + talloc_free(tmp_ctx); + return; + } + ret = ctdb_dump_memory(ctdb, dump); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n")); + talloc_free(tmp_ctx); + return; + } + + DBG_ERR("recovery daemon memory dump\n"); + + ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump); + if (ret != 0) { + DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n")); + talloc_free(tmp_ctx); + return; + } + + talloc_free(tmp_ctx); +} + +/* + handler for reload_nodes +*/ +static void reload_nodes_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + + DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n")); + + ctdb_load_nodes_file(rec->ctdb); +} + + +static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + struct ctdb_context *ctdb = rec->ctdb; + uint32_t pnn; + uint32_t *t; + int len; + + if (!this_node_is_leader(rec)) { + return; + } + + if (data.dsize != sizeof(uint32_t)) { + 
DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t))); + return; + } + + pnn = *(uint32_t *)&data.dptr[0]; + + DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn)); + + /* Copy any existing list of nodes. There's probably some + * sort of realloc variant that will do this but we need to + * make sure that freeing the old array also cancels the timer + * event for the timeout... not sure if realloc will do that. + */ + len = (rec->force_rebalance_nodes != NULL) ? + talloc_array_length(rec->force_rebalance_nodes) : + 0; + + /* This allows duplicates to be added but they don't cause + * harm. A call to add a duplicate PNN arguably means that + * the timeout should be reset, so this is the simplest + * solution. + */ + t = talloc_zero_array(rec, uint32_t, len+1); + CTDB_NO_MEMORY_VOID(ctdb, t); + if (len > 0) { + memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len); + } + t[len] = pnn; + + talloc_free(rec->force_rebalance_nodes); + + rec->force_rebalance_nodes = t; +} + + + +static void srvid_disable_and_reply(struct ctdb_recoverd *rec, + TDB_DATA data, + struct ctdb_op_state *op_state) +{ + struct ctdb_context *ctdb = rec->ctdb; + struct ctdb_disable_message *r; + uint32_t timeout; + TDB_DATA result; + int32_t ret = 0; + + /* Validate input data */ + if (data.dsize != sizeof(struct ctdb_disable_message)) { + DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu " + "expecting %lu\n", (long unsigned)data.dsize, + (long unsigned)sizeof(struct ctdb_srvid_message))); + return; + } + if (data.dptr == NULL) { + DEBUG(DEBUG_ERR,(__location__ " No data received\n")); + return; + } + + r = (struct ctdb_disable_message *)data.dptr; + timeout = r->timeout; + + ret = ctdb_op_disable(op_state, ctdb->ev, timeout); + if (ret != 0) { + goto done; + } + + /* Returning our PNN tells the caller that we succeeded */ + ret = rec->pnn; +done: + result.dsize = sizeof(int32_t); 
+ result.dptr = (uint8_t *)&ret; + srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result); +} + +static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + + srvid_disable_and_reply(rec, data, rec->takeover_run); +} + +/* Backward compatibility for this SRVID */ +static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + uint32_t timeout; + + if (data.dsize != sizeof(uint32_t)) { + DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu " + "expecting %lu\n", (long unsigned)data.dsize, + (long unsigned)sizeof(uint32_t))); + return; + } + if (data.dptr == NULL) { + DEBUG(DEBUG_ERR,(__location__ " No data received\n")); + return; + } + + timeout = *((uint32_t *)data.dptr); + + ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout); +} + +static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + + srvid_disable_and_reply(rec, data, rec->recovery); +} + +/* + handler for ip reallocate, just add it to the list of requests and + handle this later in the monitor_cluster loop so we do not recurse + with other requests to takeover_run() +*/ +static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_srvid_message *request; + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + + if (data.dsize != sizeof(struct ctdb_srvid_message)) { + DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n")); + return; + } + + request = (struct ctdb_srvid_message *)data.dptr; + + srvid_request_add(rec->ctdb, &rec->reallocate_requests, request); +} + +static void process_ipreallocate_requests(struct ctdb_context *ctdb, + 
struct ctdb_recoverd *rec) +{ + TDB_DATA result; + int32_t ret; + struct srvid_requests *current; + + /* Only process requests that are currently pending. More + * might come in while the takeover run is in progress and + * they will need to be processed later since they might + * be in response flag changes. + */ + current = rec->reallocate_requests; + rec->reallocate_requests = NULL; + + if (do_takeover_run(rec, rec->nodemap)) { + ret = rec->pnn; + } else { + ret = -1; + } + + result.dsize = sizeof(int32_t); + result.dptr = (uint8_t *)&ret; + + srvid_requests_reply(ctdb, ¤t, result); +} + +/* + * handler for assigning banning credits + */ +static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + uint32_t ban_pnn; + + /* Ignore if we are not leader */ + if (!this_node_is_leader(rec)) { + return; + } + + if (data.dsize != sizeof(uint32_t)) { + DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n", + data.dsize)); + return; + } + + ban_pnn = *(uint32_t *)data.dptr; + + ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num); +} + +/* + * Handler for leader elections + */ +static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + struct ctdb_context *ctdb = rec->ctdb; + struct election_message *em = (struct election_message *)data.dptr; + + /* Ignore election packets from ourself */ + if (rec->pnn == em->pnn) { + return; + } + + /* we got an election packet - update the timeout for the election */ + talloc_free(rec->election_timeout); + rec->election_in_progress = true; + rec->election_timeout = tevent_add_timer( + ctdb->ev, ctdb, + fast_start ? + timeval_current_ofs(0, 500000) : + timeval_current_ofs(ctdb->tunable.election_timeout, 0), + ctdb_election_timeout, rec); + + /* someone called an election. 
check their election data + and if we disagree and we would rather be the elected node, + send a new election message to all other nodes + */ + if (ctdb_election_win(rec, em)) { + if (!rec->send_election_te) { + rec->send_election_te = tevent_add_timer( + ctdb->ev, rec, + timeval_current_ofs(0, 500000), + election_send_request, rec); + } + return; + } + + /* we didn't win */ + TALLOC_FREE(rec->send_election_te); + + /* Release the cluster lock file */ + if (cluster_lock_held(rec)) { + cluster_lock_release(rec); + } + + /* Set leader to the winner of this round */ + rec->leader = em->pnn; + + return; +} + +static void cluster_lock_election(struct ctdb_recoverd *rec) +{ + bool ok; + + if (!this_node_can_be_leader(rec)) { + if (cluster_lock_held(rec)) { + cluster_lock_release(rec); + } + goto done; + } + + /* + * Don't need to unconditionally release the lock and then + * attempt to retake it. This provides stability. + */ + if (cluster_lock_held(rec)) { + goto done; + } + + rec->leader = CTDB_UNKNOWN_PNN; + + ok = cluster_lock_take(rec); + if (ok) { + rec->leader = rec->pnn; + D_WARNING("Took cluster lock, leader=%"PRIu32"\n", rec->leader); + } + +done: + rec->election_in_progress = false; +} + +/* + force the start of the election process + */ +static void force_election(struct ctdb_recoverd *rec) +{ + int ret; + struct ctdb_context *ctdb = rec->ctdb; + + D_ERR("Start election\n"); + + /* set all nodes to recovery mode to stop all internode traffic */ + ret = set_recovery_mode(ctdb, rec, rec->nodemap, CTDB_RECOVERY_ACTIVE); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n")); + return; + } + + rec->election_in_progress = true; + /* Let other nodes know that an election is underway */ + leader_broadcast_send(rec, CTDB_UNKNOWN_PNN); + + if (cluster_lock_enabled(rec)) { + cluster_lock_election(rec); + return; + } + + talloc_free(rec->election_timeout); + rec->election_timeout = tevent_add_timer( + ctdb->ev, ctdb, + 
fast_start ? + timeval_current_ofs(0, 500000) : + timeval_current_ofs(ctdb->tunable.election_timeout, 0), + ctdb_election_timeout, rec); + + ret = send_election_request(rec); + if (ret!=0) { + DBG_ERR("Failed to initiate leader election\n"); + return; + } + + /* wait for a few seconds to collect all responses */ + ctdb_wait_election(rec); +} + + +static void srvid_not_implemented(uint64_t srvid, + TDB_DATA data, + void *private_data) +{ + const char *s; + + switch (srvid) { + case CTDB_SRVID_SET_NODE_FLAGS: + s = "CTDB_SRVID_SET_NODE_FLAGS"; + break; + default: + s = "UNKNOWN"; + } + + D_WARNING("SRVID %s (0x%" PRIx64 ") is obsolete\n", s, srvid); +} + +/* + handler for when we need to push out flag changes to all other nodes +*/ +static void push_flags_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type( + private_data, struct ctdb_recoverd); + struct ctdb_context *ctdb = rec->ctdb; + int ret; + struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr; + struct ctdb_node_map_old *nodemap=NULL; + TALLOC_CTX *tmp_ctx = talloc_new(ctdb); + uint32_t *nodes; + + /* read the node flags from the leader */ + ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->leader, + tmp_ctx, &nodemap); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn)); + talloc_free(tmp_ctx); + return; + } + if (c->pnn >= nodemap->num) { + DBG_ERR("Nodemap from leader does not contain node %d\n", + c->pnn); + talloc_free(tmp_ctx); + return; + } + + /* send the flags update to all connected nodes */ + nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true); + + if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS, + nodes, 0, CONTROL_TIMEOUT(), + false, data, + NULL, NULL, + NULL) != 0) { + DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n")); + + talloc_free(tmp_ctx); + return; + } + + talloc_free(tmp_ctx); +} + +static void 
leader_broadcast_timeout_handler(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval current_time, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type_abort( + private_data, struct ctdb_recoverd); + + rec->leader_broadcast_timeout_te = NULL; + + D_NOTICE("Leader broadcast timeout\n"); + + force_election(rec); +} + +static void leader_broadcast_timeout_cancel(struct ctdb_recoverd *rec) +{ + TALLOC_FREE(rec->leader_broadcast_timeout_te); +} + +static int leader_broadcast_timeout_start(struct ctdb_recoverd *rec) +{ + struct ctdb_context *ctdb = rec->ctdb; + + /* + * This should not be necessary. However, there will be + * interactions with election code here. It will want to + * cancel and restart the timer around potentially long + * elections. + */ + leader_broadcast_timeout_cancel(rec); + + rec->leader_broadcast_timeout_te = + tevent_add_timer( + ctdb->ev, + rec, + timeval_current_ofs(ctdb_config.leader_timeout, 0), + leader_broadcast_timeout_handler, + rec); + if (rec->leader_broadcast_timeout_te == NULL) { + D_ERR("Unable to start leader broadcast timeout\n"); + return ENOMEM; + } + + return 0; +} + +static bool leader_broadcast_timeout_active(struct ctdb_recoverd *rec) +{ + return rec->leader_broadcast_timeout_te != NULL; +} + +static void leader_handler(uint64_t srvid, TDB_DATA data, void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type_abort( + private_data, struct ctdb_recoverd); + uint32_t pnn; + size_t npull; + int ret; + + ret = ctdb_uint32_pull(data.dptr, data.dsize, &pnn, &npull); + if (ret != 0) { + DBG_WARNING("Unable to parse leader broadcast, ret=%d\n", ret); + return; + } + + leader_broadcast_timeout_cancel(rec); + + if (pnn == rec->leader) { + goto done; + } + + if (pnn == CTDB_UNKNOWN_PNN) { + bool was_election_in_progress = rec->election_in_progress; + + /* + * Leader broadcast timeout was cancelled above - stop + * main loop from restarting it until election is + * complete + */ + 
rec->election_in_progress = true; + + /* + * This is the only notification for a cluster lock + * election, so handle it here... + */ + if (cluster_lock_enabled(rec) && !was_election_in_progress) { + cluster_lock_election(rec); + } + + return; + } + + D_NOTICE("Received leader broadcast, leader=%"PRIu32"\n", pnn); + rec->leader = pnn; + +done: + leader_broadcast_timeout_start(rec); +} + +struct verify_recmode_normal_data { + uint32_t count; + enum monitor_result status; +}; + +static void verify_recmode_normal_callback(struct ctdb_client_control_state *state) +{ + struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data); + + + /* one more node has responded with recmode data*/ + rmdata->count--; + + /* if we failed to get the recmode, then return an error and let + the main loop try again. + */ + if (state->state != CTDB_CONTROL_DONE) { + if (rmdata->status == MONITOR_OK) { + rmdata->status = MONITOR_FAILED; + } + return; + } + + /* if we got a response, then the recmode will be stored in the + status field + */ + if (state->status != CTDB_RECOVERY_NORMAL) { + DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. 
Start recovery process\n", state->c->hdr.destnode)); + rmdata->status = MONITOR_RECOVERY_NEEDED; + } + + return; +} + + +/* verify that all nodes are in normal recovery mode */ +static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap) +{ + struct verify_recmode_normal_data *rmdata; + TALLOC_CTX *mem_ctx = talloc_new(ctdb); + struct ctdb_client_control_state *state; + enum monitor_result status; + unsigned int j; + + rmdata = talloc(mem_ctx, struct verify_recmode_normal_data); + CTDB_NO_MEMORY_FATAL(ctdb, rmdata); + rmdata->count = 0; + rmdata->status = MONITOR_OK; + + /* loop over all active nodes and send an async getrecmode call to + them*/ + for (j=0; j<nodemap->num; j++) { + if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { + continue; + } + state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, + CONTROL_TIMEOUT(), + nodemap->nodes[j].pnn); + if (state == NULL) { + /* we failed to send the control, treat this as + an error and try again next iteration + */ + DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n")); + talloc_free(mem_ctx); + return MONITOR_FAILED; + } + + /* set up the callback functions */ + state->async.fn = verify_recmode_normal_callback; + state->async.private_data = rmdata; + + /* one more control to wait for to complete */ + rmdata->count++; + } + + + /* now wait for up to the maximum number of seconds allowed + or until all nodes we expect a response from has replied + */ + while (rmdata->count > 0) { + tevent_loop_once(ctdb->ev); + } + + status = rmdata->status; + talloc_free(mem_ctx); + return status; +} + + +static bool interfaces_have_changed(struct ctdb_context *ctdb, + struct ctdb_recoverd *rec) +{ + struct ctdb_iface_list_old *ifaces = NULL; + TALLOC_CTX *mem_ctx; + bool ret = false; + + mem_ctx = talloc_new(NULL); + + /* Read the interfaces from the local node */ + if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), + CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) { + 
D_ERR("Unable to get interfaces from local node %u\n", rec->pnn); + /* We could return an error. However, this will be + * rare so we'll decide that the interfaces have + * actually changed, just in case. + */ + talloc_free(mem_ctx); + return true; + } + + if (!rec->ifaces) { + /* We haven't been here before so things have changed */ + DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n")); + ret = true; + } else if (rec->ifaces->num != ifaces->num) { + /* Number of interfaces has changed */ + DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n", + rec->ifaces->num, ifaces->num)); + ret = true; + } else { + /* See if interface names or link states have changed */ + unsigned int i; + for (i = 0; i < rec->ifaces->num; i++) { + struct ctdb_iface * iface = &rec->ifaces->ifaces[i]; + if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) { + DEBUG(DEBUG_NOTICE, + ("Interface in slot %d changed: %s => %s\n", + i, iface->name, ifaces->ifaces[i].name)); + ret = true; + break; + } + if (iface->link_state != ifaces->ifaces[i].link_state) { + DEBUG(DEBUG_NOTICE, + ("Interface %s changed state: %d => %d\n", + iface->name, iface->link_state, + ifaces->ifaces[i].link_state)); + ret = true; + break; + } + } + } + + talloc_free(rec->ifaces); + rec->ifaces = talloc_steal(rec, ifaces); + + talloc_free(mem_ctx); + return ret; +} + +/* Check that the local allocation of public IP addresses is correct + * and do some house-keeping */ +static int verify_local_ip_allocation(struct ctdb_recoverd *rec) +{ + TALLOC_CTX *mem_ctx = talloc_new(NULL); + struct ctdb_context *ctdb = rec->ctdb; + unsigned int j; + int ret; + bool need_takeover_run = false; + struct ctdb_public_ip_list_old *ips = NULL; + + /* If we are not the leader then do some housekeeping */ + if (!this_node_is_leader(rec)) { + /* Ignore any IP reallocate requests - only leader + * processes them + */ + TALLOC_FREE(rec->reallocate_requests); + /* Clear any nodes that should be force rebalanced in + * the next takeover 
run. If the leader has changed + * then we don't want to process these some time in + * the future. + */ + TALLOC_FREE(rec->force_rebalance_nodes); + } + + /* Return early if disabled... */ + if (ctdb_config.failover_disabled || + ctdb_op_is_disabled(rec->takeover_run)) { + talloc_free(mem_ctx); + return 0; + } + + if (interfaces_have_changed(ctdb, rec)) { + need_takeover_run = true; + } + + /* If there are unhosted IPs but this node can host them then + * trigger an IP reallocation */ + + /* Read *available* IPs from local node */ + ret = ctdb_ctrl_get_public_ips_flags( + ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, + CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n")); + talloc_free(mem_ctx); + return -1; + } + + for (j=0; j<ips->num; j++) { + if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN && + rec->nodemap->nodes[rec->pnn].flags == 0) { + DEBUG(DEBUG_WARNING, + ("Unassigned IP %s can be served by this node\n", + ctdb_addr_to_str(&ips->ips[j].addr))); + need_takeover_run = true; + } + } + + talloc_free(ips); + + if (!ctdb->do_checkpublicip) { + goto done; + } + + /* Validate the IP addresses that this node has on network + * interfaces. If there is an inconsistency between reality + * and the state expected by CTDB then try to fix it by + * triggering an IP reallocation or releasing extraneous IP + * addresses. 
*/ + + /* Read *known* IPs from local node */ + ret = ctdb_ctrl_get_public_ips_flags( + ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n")); + talloc_free(mem_ctx); + return -1; + } + + for (j=0; j<ips->num; j++) { + if (ips->ips[j].pnn == rec->pnn) { + if (!ctdb_sys_have_ip(&ips->ips[j].addr)) { + DEBUG(DEBUG_ERR, + ("Assigned IP %s not on an interface\n", + ctdb_addr_to_str(&ips->ips[j].addr))); + need_takeover_run = true; + } + } else { + if (ctdb_sys_have_ip(&ips->ips[j].addr)) { + DEBUG(DEBUG_ERR, + ("IP %s incorrectly on an interface\n", + ctdb_addr_to_str(&ips->ips[j].addr))); + need_takeover_run = true; + } + } + } + +done: + if (need_takeover_run) { + struct ctdb_srvid_message rd; + TDB_DATA data; + + DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n")); + + ZERO_STRUCT(rd); + rd.pnn = rec->pnn; + rd.srvid = 0; + data.dptr = (uint8_t *)&rd; + data.dsize = sizeof(rd); + + ret = ctdb_client_send_message(ctdb, + CTDB_BROADCAST_CONNECTED, + CTDB_SRVID_TAKEOVER_RUN, + data); + if (ret != 0) { + D_ERR("Failed to send takeover run request\n"); + } + } + talloc_free(mem_ctx); + return 0; +} + + +struct remote_nodemaps_state { + struct ctdb_node_map_old **remote_nodemaps; + struct ctdb_recoverd *rec; +}; + +static void async_getnodemap_callback(struct ctdb_context *ctdb, + uint32_t node_pnn, + int32_t res, + TDB_DATA outdata, + void *callback_data) +{ + struct remote_nodemaps_state *state = + (struct remote_nodemaps_state *)callback_data; + struct ctdb_node_map_old **remote_nodemaps = state->remote_nodemaps; + struct ctdb_node_map_old *nodemap = state->rec->nodemap; + size_t i; + + for (i = 0; i < nodemap->num; i++) { + if (nodemap->nodes[i].pnn == node_pnn) { + break; + } + } + + if (i >= nodemap->num) { + DBG_ERR("Invalid PNN %"PRIu32"\n", node_pnn); + return; + } + + remote_nodemaps[i] = (struct ctdb_node_map_old *)talloc_steal( + remote_nodemaps, outdata.dptr); + +} + +static 
void async_getnodemap_error(struct ctdb_context *ctdb, + uint32_t node_pnn, + int32_t res, + TDB_DATA outdata, + void *callback_data) +{ + struct remote_nodemaps_state *state = + (struct remote_nodemaps_state *)callback_data; + struct ctdb_recoverd *rec = state->rec; + + DBG_ERR("Failed to retrieve nodemap from node %u\n", node_pnn); + ctdb_set_culprit(rec, node_pnn); +} + +static int get_remote_nodemaps(struct ctdb_recoverd *rec, + TALLOC_CTX *mem_ctx, + struct ctdb_node_map_old ***remote_nodemaps) +{ + struct ctdb_context *ctdb = rec->ctdb; + struct ctdb_node_map_old **t; + uint32_t *nodes; + struct remote_nodemaps_state state; + int ret; + + t = talloc_zero_array(mem_ctx, + struct ctdb_node_map_old *, + rec->nodemap->num); + if (t == NULL) { + DBG_ERR("Memory allocation error\n"); + return -1; + } + + nodes = list_of_connected_nodes(ctdb, rec->nodemap, mem_ctx, false); + + state.remote_nodemaps = t; + state.rec = rec; + + ret = ctdb_client_async_control(ctdb, + CTDB_CONTROL_GET_NODEMAP, + nodes, + 0, + CONTROL_TIMEOUT(), + false, + tdb_null, + async_getnodemap_callback, + async_getnodemap_error, + &state); + talloc_free(nodes); + + if (ret != 0) { + talloc_free(t); + return ret; + } + + *remote_nodemaps = t; + return 0; +} + +static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, + TALLOC_CTX *mem_ctx) +{ + struct ctdb_node_map_old *nodemap=NULL; + struct ctdb_node_map_old **remote_nodemaps=NULL; + struct ctdb_vnn_map *vnnmap=NULL; + struct ctdb_vnn_map *remote_vnnmap=NULL; + uint32_t num_lmasters; + int32_t debug_level; + unsigned int i, j; + int ret; + bool self_ban; + + + /* verify that the main daemon is still running */ + if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) { + DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. 
Shutting down recovery daemon\n")); + exit(-1); + } + + /* ping the local daemon to tell it we are alive */ + ctdb_ctrl_recd_ping(ctdb); + + if (rec->election_in_progress) { + /* an election is in progress */ + return; + } + + /* + * Start leader broadcasts if they are not active (1st time + * through main loop? Memory allocation error?) + */ + if (!leader_broadcast_loop_active(rec)) { + ret = leader_broadcast_loop(rec); + if (ret != 0) { + D_ERR("Failed to set up leader broadcast\n"); + ctdb_set_culprit(rec, rec->pnn); + } + } + /* + * Similar for leader broadcast timeouts. These can also have + * been stopped by another node receiving a leader broadcast + * timeout and transmitting an "unknown leader broadcast". + * Note that this should never be done during an election - at + * the moment there is nothing between here and the above + * election-in-progress check that can process an election + * result (i.e. no event loop). + */ + if (!leader_broadcast_timeout_active(rec)) { + ret = leader_broadcast_timeout_start(rec); + if (ret != 0) { + ctdb_set_culprit(rec, rec->pnn); + } + } + + + /* read the debug level from the parent and update locally */ + ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level); + if (ret !=0) { + DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n")); + return; + } + debuglevel_set(debug_level); + + /* get relevant tunables */ + ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable); + if (ret != 0) { + DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n")); + return; + } + + /* get runstate */ + ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(), + CTDB_CURRENT_NODE, &ctdb->runstate); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n")); + return; + } + + /* get nodemap */ + ret = ctdb_ctrl_getnodemap(ctdb, + CONTROL_TIMEOUT(), + rec->pnn, + rec, + &nodemap); + if (ret != 0) { + DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", 
rec->pnn); + return; + } + talloc_free(rec->nodemap); + rec->nodemap = nodemap; + + /* remember our own node flags */ + rec->node_flags = nodemap->nodes[rec->pnn].flags; + + ban_misbehaving_nodes(rec, &self_ban); + if (self_ban) { + DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n")); + return; + } + + ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), + CTDB_CURRENT_NODE, &ctdb->recovery_mode); + if (ret != 0) { + D_ERR("Failed to read recmode from local node\n"); + return; + } + + /* if the local daemon is STOPPED or BANNED, we verify that the databases are + also frozen and that the recmode is set to active. + */ + if (rec->node_flags & NODE_FLAGS_INACTIVE) { + /* If this node has become inactive then we want to + * reduce the chances of it taking over the leader + * role when it becomes active again. This + * helps to stabilise the leader role so that + * it stays on the most stable node. + */ + rec->priority_time = timeval_current(); + + if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) { + DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n")); + + ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n")); + + return; + } + } + if (! rec->frozen_on_inactive) { + ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), + CTDB_CURRENT_NODE); + if (ret != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Failed to freeze node " + "in STOPPED or BANNED state\n")); + return; + } + + rec->frozen_on_inactive = true; + } + + /* If this node is stopped or banned then it is not the recovery + * master, so don't do anything. This prevents stopped or banned + * node from starting election and sending unnecessary controls. 
+ */ + return; + } + + rec->frozen_on_inactive = false; + + /* Retrieve capabilities from all connected nodes */ + ret = update_capabilities(rec, nodemap); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n")); + return; + } + + if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) { + /* Check if an IP takeover run is needed and trigger one if + * necessary */ + verify_local_ip_allocation(rec); + } + + /* If this node is not the leader then skip recovery checks */ + if (!this_node_is_leader(rec)) { + return; + } + + + /* Get the nodemaps for all connected remote nodes */ + ret = get_remote_nodemaps(rec, mem_ctx, &remote_nodemaps); + if (ret != 0) { + DBG_ERR("Failed to read remote nodemaps\n"); + return; + } + + /* Ensure our local and remote flags are correct */ + ret = update_flags(rec, nodemap, remote_nodemaps); + if (ret != 0) { + D_ERR("Unable to update flags\n"); + return; + } + + if (ctdb->num_nodes != nodemap->num) { + DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num)); + ctdb_load_nodes_file(ctdb); + return; + } + + /* get the vnnmap */ + ret = ctdb_ctrl_getvnnmap(ctdb, + CONTROL_TIMEOUT(), + rec->pnn, + mem_ctx, + &vnnmap); + if (ret != 0) { + DBG_ERR("Unable to get vnnmap from node %u\n", rec->pnn); + return; + } + + if (rec->need_recovery) { + /* a previous recovery didn't finish */ + do_recovery(rec, mem_ctx); + return; + } + + /* verify that all active nodes are in normal mode + and not in recovery mode + */ + switch (verify_recmode(ctdb, nodemap)) { + case MONITOR_RECOVERY_NEEDED: + do_recovery(rec, mem_ctx); + return; + case MONITOR_FAILED: + return; + case MONITOR_ELECTION_NEEDED: + /* can not happen */ + case MONITOR_OK: + break; + } + + if (cluster_lock_enabled(rec)) { + /* We must already hold the cluster lock */ + if (!cluster_lock_held(rec)) { + D_ERR("Failed cluster lock sanity check\n"); + ctdb_set_culprit(rec, rec->pnn); + 
do_recovery(rec, mem_ctx); + return; + } + } + + + /* If recoveries are disabled then there is no use doing any + * nodemap or flags checks. Recoveries might be disabled due + * to "reloadnodes", so doing these checks might cause an + * unnecessary recovery. */ + if (ctdb_op_is_disabled(rec->recovery)) { + goto takeover_run_checks; + } + + /* verify that all other nodes have the same nodemap as we have + */ + for (j=0; j<nodemap->num; j++) { + if (nodemap->nodes[j].pnn == rec->pnn) { + continue; + } + if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { + continue; + } + + /* if the nodes disagree on how many nodes there are + then this is a good reason to try recovery + */ + if (remote_nodemaps[j]->num != nodemap->num) { + DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n", + nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num)); + ctdb_set_culprit(rec, nodemap->nodes[j].pnn); + do_recovery(rec, mem_ctx); + return; + } + + /* if the nodes disagree on which nodes exist and are + active, then that is also a good reason to do recovery + */ + for (i=0;i<nodemap->num;i++) { + if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) { + DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n", + nodemap->nodes[j].pnn, i, + remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn)); + ctdb_set_culprit(rec, nodemap->nodes[j].pnn); + do_recovery(rec, mem_ctx); + return; + } + } + } + + /* count how many active nodes there are */ + num_lmasters = 0; + for (i=0; i<nodemap->num; i++) { + if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) { + if (ctdb_node_has_capabilities(rec->caps, + ctdb->nodes[i]->pnn, + CTDB_CAP_LMASTER)) { + num_lmasters++; + } + } + } + + + /* There must be the same number of lmasters in the vnn map as + * there are active nodes with the lmaster capability... or + * do a recovery. 
+ */ + if (vnnmap->size != num_lmasters) { + DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n", + vnnmap->size, num_lmasters)); + ctdb_set_culprit(rec, rec->pnn); + do_recovery(rec, mem_ctx); + return; + } + + /* + * Verify that all active lmaster nodes in the nodemap also + * exist in the vnnmap + */ + for (j=0; j<nodemap->num; j++) { + if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { + continue; + } + if (! ctdb_node_has_capabilities(rec->caps, + nodemap->nodes[j].pnn, + CTDB_CAP_LMASTER)) { + continue; + } + if (nodemap->nodes[j].pnn == rec->pnn) { + continue; + } + + for (i=0; i<vnnmap->size; i++) { + if (vnnmap->map[i] == nodemap->nodes[j].pnn) { + break; + } + } + if (i == vnnmap->size) { + D_ERR("Active LMASTER node %u is not in the vnnmap\n", + nodemap->nodes[j].pnn); + ctdb_set_culprit(rec, nodemap->nodes[j].pnn); + do_recovery(rec, mem_ctx); + return; + } + } + + + /* verify that all other nodes have the same vnnmap + and are from the same generation + */ + for (j=0; j<nodemap->num; j++) { + if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { + continue; + } + if (nodemap->nodes[j].pnn == rec->pnn) { + continue; + } + + ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, + mem_ctx, &remote_vnnmap); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n", + nodemap->nodes[j].pnn)); + return; + } + + /* verify the vnnmap generation is the same */ + if (vnnmap->generation != remote_vnnmap->generation) { + DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. 
%u vs %u (ours)\n", + nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation)); + ctdb_set_culprit(rec, nodemap->nodes[j].pnn); + do_recovery(rec, mem_ctx); + return; + } + + /* verify the vnnmap size is the same */ + if (vnnmap->size != remote_vnnmap->size) { + DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n", + nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size)); + ctdb_set_culprit(rec, nodemap->nodes[j].pnn); + do_recovery(rec, mem_ctx); + return; + } + + /* verify the vnnmap is the same */ + for (i=0;i<vnnmap->size;i++) { + if (remote_vnnmap->map[i] != vnnmap->map[i]) { + DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n", + nodemap->nodes[j].pnn)); + ctdb_set_culprit(rec, nodemap->nodes[j].pnn); + do_recovery(rec, mem_ctx); + return; + } + } + } + + /* FIXME: Add remote public IP checking to ensure that nodes + * have the IP addresses that are allocated to them. */ + +takeover_run_checks: + + /* If there are IP takeover runs requested or the previous one + * failed then perform one and notify the waiters */ + if (!ctdb_op_is_disabled(rec->takeover_run) && + (rec->reallocate_requests || rec->need_takeover_run)) { + process_ipreallocate_requests(ctdb, rec); + } +} + +static void recd_sig_term_handler(struct tevent_context *ev, + struct tevent_signal *se, int signum, + int count, void *dont_care, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type_abort( + private_data, struct ctdb_recoverd); + + DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n")); + cluster_lock_release(rec); + exit(0); +} + +/* + * Periodically log elements of the cluster state + * + * This can be used to confirm a split brain has occurred + */ +static void maybe_log_cluster_state(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval current_time, + void *private_data) +{ + struct ctdb_recoverd *rec = talloc_get_type_abort( + private_data, struct ctdb_recoverd); + struct 
ctdb_context *ctdb = rec->ctdb; + struct tevent_timer *tt; + + /* Zero while the cluster is complete, otherwise the time at which it was first seen incomplete */ + static struct timeval start_incomplete = { + .tv_sec = 0, + }; + + bool is_complete; + bool was_complete; + unsigned int i; + double seconds; + unsigned int minutes; + unsigned int num_connected; + + if (!this_node_is_leader(rec)) { + goto done; + } + + if (rec->nodemap == NULL) { + goto done; + } + + is_complete = true; + num_connected = 0; + for (i = 0; i < rec->nodemap->num; i++) { + struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i]; + + if (n->pnn == rec->pnn) { + continue; + } + if ((n->flags & NODE_FLAGS_DELETED) != 0) { + continue; + } + if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) { + is_complete = false; + continue; + } + + num_connected++; + } + + was_complete = timeval_is_zero(&start_incomplete); + + if (is_complete) { + if (! was_complete) { + D_WARNING("Cluster complete with leader=%u\n", + rec->leader); + start_incomplete = timeval_zero(); + } + goto done; + } + + /* Cluster is newly incomplete... */ + if (was_complete) { + start_incomplete = current_time; + minutes = 0; + goto log; + } + + /* + * Cluster has been incomplete since previous check, so figure + * out how long (in minutes) and decide whether to log anything + */ + seconds = timeval_elapsed2(&start_incomplete, &current_time); + minutes = (unsigned int)seconds / 60; + if (minutes >= 60) { + /* Over an hour, log every hour */ + if (minutes % 60 != 0) { + goto done; + } + } else if (minutes >= 10) { + /* Over 10 minutes, log every 10 minutes */ + if (minutes % 10 != 0) { + goto done; + } + } + +log: + D_WARNING("Cluster incomplete with leader=%u, elapsed=%u minutes, " + "connected=%u\n", + rec->leader, + minutes, + num_connected); + +done: + /* Always re-arm: this handler reschedules itself every 60 seconds */ + tt = tevent_add_timer(ctdb->ev, + rec, + timeval_current_ofs(60, 0), + maybe_log_cluster_state, + rec); + if (tt == NULL) { + DBG_WARNING("Failed to set up cluster state timer\n"); + } +} + +static void recd_sighup_hook(void *private_data) +{ + struct ctdb_recoverd *rec = 
talloc_get_type_abort( + private_data, struct ctdb_recoverd); + + if (rec->helper_pid > 0) { + kill(rec->helper_pid, SIGHUP); + } +} + +/* + the main monitoring loop + */ +static void monitor_cluster(struct ctdb_context *ctdb) +{ + struct tevent_signal *se; + struct ctdb_recoverd *rec; + bool status; + + DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n")); + + rec = talloc_zero(ctdb, struct ctdb_recoverd); + CTDB_NO_MEMORY_FATAL(ctdb, rec); + + rec->ctdb = ctdb; + rec->leader = CTDB_UNKNOWN_PNN; + rec->pnn = ctdb_get_pnn(ctdb); + rec->cluster_lock_handle = NULL; + rec->helper_pid = -1; + + rec->takeover_run = ctdb_op_init(rec, "takeover runs"); + CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run); + + rec->recovery = ctdb_op_init(rec, "recoveries"); + CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery); + + rec->priority_time = timeval_current(); + rec->frozen_on_inactive = false; + + status = logging_setup_sighup_handler(rec->ctdb->ev, + rec, + recd_sighup_hook, + rec); + if (!status) { + D_ERR("Failed to install SIGHUP handler\n"); + exit(1); + } + + se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0, + recd_sig_term_handler, rec); + if (se == NULL) { + DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n")); + exit(1); + } + + if (!cluster_lock_enabled(rec)) { + struct tevent_timer *tt; + + tt = tevent_add_timer(ctdb->ev, + rec, + timeval_current_ofs(60, 0), + maybe_log_cluster_state, + rec); + if (tt == NULL) { + DBG_WARNING("Failed to set up cluster state timer\n"); + } + } + + /* register a message port for sending memory dumps */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec); + + /* when a node is assigned banning credits */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING, + banning_handler, rec); + + /* register a message port for recovery elections */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec); + + ctdb_client_set_message_handler(ctdb, + CTDB_SRVID_SET_NODE_FLAGS, + 
srvid_not_implemented, + rec); + + /* when we are asked to puch out a flag change */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec); + + /* register a message port for reloadnodes */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec); + + /* register a message port for performing a takeover run */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec); + + /* register a message port for disabling the ip check for a short while */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec); + + /* register a message port for forcing a rebalance of a node next + reallocation */ + ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec); + + /* Register a message port for disabling takeover runs */ + ctdb_client_set_message_handler(ctdb, + CTDB_SRVID_DISABLE_TAKEOVER_RUNS, + disable_takeover_runs_handler, rec); + + /* Register a message port for disabling recoveries */ + ctdb_client_set_message_handler(ctdb, + CTDB_SRVID_DISABLE_RECOVERIES, + disable_recoveries_handler, rec); + + ctdb_client_set_message_handler(ctdb, + CTDB_SRVID_LEADER, + leader_handler, + rec); + + for (;;) { + TALLOC_CTX *mem_ctx = talloc_new(ctdb); + struct timeval start; + double elapsed; + + if (!mem_ctx) { + DEBUG(DEBUG_CRIT,(__location__ + " Failed to create temp context\n")); + exit(-1); + } + + start = timeval_current(); + main_loop(ctdb, rec, mem_ctx); + talloc_free(mem_ctx); + + /* we only check for recovery once every second */ + elapsed = timeval_elapsed(&start); + if (elapsed < ctdb->tunable.recover_interval) { + ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval + - elapsed); + } + } +} + +/* + event handler for when the main ctdbd dies + */ +static void ctdb_recoverd_parent(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + 
DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n")); + _exit(1); +} + +/* + called regularly to verify that the recovery daemon is still running + */ +static void ctdb_check_recd(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval yt, void *p) +{ + struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context); + + if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) { + DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid)); + + tevent_add_timer(ctdb->ev, ctdb, timeval_zero(), + ctdb_restart_recd, ctdb); + + return; + } + + tevent_add_timer(ctdb->ev, ctdb->recd_ctx, + timeval_current_ofs(30, 0), + ctdb_check_recd, ctdb); +} + +static void recd_sig_child_handler(struct tevent_context *ev, + struct tevent_signal *se, int signum, + int count, void *dont_care, + void *private_data) +{ +// struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + int status; + pid_t pid = -1; + + while (pid != 0) { + pid = waitpid(-1, &status, WNOHANG); + if (pid == -1) { + if (errno != ECHILD) { + DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. 
errno:%s(%d)\n", strerror(errno),errno)); + } + return; + } + if (pid > 0) { + DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid)); + } + } +} + +/* + startup the recovery daemon as a child of the main ctdb daemon + */ +int ctdb_start_recoverd(struct ctdb_context *ctdb) +{ + int fd[2]; + struct tevent_signal *se; + struct tevent_fd *fde; + int ret; + + if (pipe(fd) != 0) { + return -1; + } + + ctdb->recoverd_pid = ctdb_fork(ctdb); + if (ctdb->recoverd_pid == -1) { + return -1; + } + + if (ctdb->recoverd_pid != 0) { + talloc_free(ctdb->recd_ctx); + ctdb->recd_ctx = talloc_new(ctdb); + CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx); + + close(fd[0]); + tevent_add_timer(ctdb->ev, ctdb->recd_ctx, + timeval_current_ofs(30, 0), + ctdb_check_recd, ctdb); + return 0; + } + + close(fd[1]); + + srandom(getpid() ^ time(NULL)); + + ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd"); + if (ret != 0) { + return -1; + } + + prctl_set_comment("ctdb_recoverd"); + if (switch_from_server_to_client(ctdb) != 0) { + DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. 
shutting down.\n")); + exit(1); + } + + DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0])); + + fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ, + ctdb_recoverd_parent, &fd[0]); + tevent_fd_set_auto_close(fde); + + /* set up a handler to pick up sigchld */ + se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0, + recd_sig_child_handler, ctdb); + if (se == NULL) { + DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n")); + exit(1); + } + + monitor_cluster(ctdb); + + DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n")); + return -1; +} + +/* + shutdown the recovery daemon + */ +void ctdb_stop_recoverd(struct ctdb_context *ctdb) +{ + if (ctdb->recoverd_pid == 0) { + return; + } + + DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n")); + ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM); + + TALLOC_FREE(ctdb->recd_ctx); + TALLOC_FREE(ctdb->recd_ping_count); +} + +static void ctdb_restart_recd(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + + DEBUG(DEBUG_ERR,("Restarting recovery daemon\n")); + ctdb_stop_recoverd(ctdb); + ctdb_start_recoverd(ctdb); +} diff --git a/ctdb/server/ctdb_recovery_helper.c b/ctdb/server/ctdb_recovery_helper.c new file mode 100644 index 0000000..4df4841 --- /dev/null +++ b/ctdb/server/ctdb_recovery_helper.c @@ -0,0 +1,3200 @@ +/* + ctdb parallel database recovery + + Copyright (C) Amitay Isaacs 2015 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" + +#include <talloc.h> +#include <tevent.h> +#include <tdb.h> +#include <libgen.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/sys_rw.h" +#include "lib/util/time.h" +#include "lib/util/tevent_unix.h" +#include "lib/util/util.h" +#include "lib/util/smb_strtox.h" + +#include "protocol/protocol.h" +#include "protocol/protocol_api.h" +#include "client/client.h" + +#include "common/logging.h" + +static int recover_timeout = 30; + +#define NUM_RETRIES 3 + +#define TIMEOUT() timeval_current_ofs(recover_timeout, 0) + +/* + * Utility functions + */ + +static bool generic_recv(struct tevent_req *req, int *perr) +{ + int err; + + if (tevent_req_is_unix_error(req, &err)) { + if (perr != NULL) { + *perr = err; + } + return false; + } + + return true; +} + +static uint64_t rec_srvid = CTDB_SRVID_RECOVERY; + +static uint64_t srvid_next(void) +{ + rec_srvid += 1; + return rec_srvid; +} + +/* + * Node related functions + */ + +struct node_list { + uint32_t *pnn_list; + uint32_t *caps; + uint32_t *ban_credits; + unsigned int size; + unsigned int count; +}; + +static struct node_list *node_list_init(TALLOC_CTX *mem_ctx, unsigned int size) +{ + struct node_list *nlist; + unsigned int i; + + nlist = talloc_zero(mem_ctx, struct node_list); + if (nlist == NULL) { + return NULL; + } + + nlist->pnn_list = talloc_array(nlist, uint32_t, size); + nlist->caps = talloc_zero_array(nlist, uint32_t, size); + nlist->ban_credits = talloc_zero_array(nlist, uint32_t, size); + + if (nlist->pnn_list == NULL || + nlist->caps == NULL || + nlist->ban_credits == NULL) { + talloc_free(nlist); + return NULL; + } + nlist->size = size; + + for (i=0; i<nlist->size; i++) { + 
nlist->pnn_list[i] = CTDB_UNKNOWN_PNN; + } + + return nlist; +} + +static bool node_list_add(struct node_list *nlist, uint32_t pnn) +{ + unsigned int i; + + if (nlist->count == nlist->size) { + return false; + } + + for (i=0; i<nlist->count; i++) { + if (nlist->pnn_list[i] == pnn) { + return false; + } + } + + nlist->pnn_list[nlist->count] = pnn; + nlist->count += 1; + + return true; +} + +static uint32_t *node_list_lmaster(struct node_list *nlist, + TALLOC_CTX *mem_ctx, + unsigned int *pnn_count) +{ + uint32_t *pnn_list; + unsigned int count, i; + + pnn_list = talloc_zero_array(mem_ctx, uint32_t, nlist->count); + if (pnn_list == NULL) { + return NULL; + } + + count = 0; + for (i=0; i<nlist->count; i++) { + if (!(nlist->caps[i] & CTDB_CAP_LMASTER)) { + continue; + } + + pnn_list[count] = nlist->pnn_list[i]; + count += 1; + } + + *pnn_count = count; + return pnn_list; +} + +static void node_list_ban_credits(struct node_list *nlist, uint32_t pnn) +{ + unsigned int i; + + for (i=0; i<nlist->count; i++) { + if (nlist->pnn_list[i] == pnn) { + nlist->ban_credits[i] += 1; + break; + } + } +} + +/* + * Database list functions + * + * Simple, naive implementation that could be updated to a db_hash or similar + */ + +struct db { + struct db *prev, *next; + + uint32_t db_id; + uint32_t db_flags; + uint32_t *pnn_list; + unsigned int num_nodes; +}; + +struct db_list { + unsigned int num_dbs; + struct db *db; + unsigned int num_nodes; +}; + +/* + * Allocate an empty database list on mem_ctx, recording num_nodes as the + * per-database node capacity. Returns NULL on allocation failure. + */ +static struct db_list *db_list_init(TALLOC_CTX *mem_ctx, unsigned int num_nodes) +{ + struct db_list *l; + + l = talloc_zero(mem_ctx, struct db_list); + if (l == NULL) { + return NULL; + } + l->num_nodes = num_nodes; + + return l; +} + +static struct db *db_list_find(struct db_list *dblist, uint32_t db_id) +{ + struct db *db; + + if (dblist == NULL) { + return NULL; + } + + db = dblist->db; + while (db != NULL && db->db_id != db_id) { + db = db->next; + } + + return db; +} + +static int db_list_add(struct db_list *dblist, + uint32_t db_id, + uint32_t db_flags, + uint32_t node) 
+{ + struct db *db = NULL; + + if (dblist == NULL) { + return EINVAL; + } + + db = talloc_zero(dblist, struct db); + if (db == NULL) { + return ENOMEM; + } + + db->db_id = db_id; + db->db_flags = db_flags; + db->pnn_list = talloc_zero_array(db, uint32_t, dblist->num_nodes); + if (db->pnn_list == NULL) { + talloc_free(db); + return ENOMEM; + } + db->pnn_list[0] = node; + db->num_nodes = 1; + + DLIST_ADD_END(dblist->db, db); + dblist->num_dbs++; + + return 0; +} + +static int db_list_check_and_add(struct db_list *dblist, + uint32_t db_id, + uint32_t db_flags, + uint32_t node) +{ + struct db *db = NULL; + int ret; + + /* + * These flags are masked out because they are only set on a + * node when a client attaches to that node, so they might not + * be set yet. They can't be passed as part of the attach, so + * they're no use here. + */ + db_flags &= ~(CTDB_DB_FLAGS_READONLY | CTDB_DB_FLAGS_STICKY); + + if (dblist == NULL) { + return EINVAL; + } + + db = db_list_find(dblist, db_id); + if (db == NULL) { + ret = db_list_add(dblist, db_id, db_flags, node); + return ret; + } + + if (db->db_flags != db_flags) { + D_ERR("Incompatible database flags for 0x%"PRIx32" " + "(0x%"PRIx32" != 0x%"PRIx32")\n", + db_id, + db_flags, + db->db_flags); + return EINVAL; + } + + if (db->num_nodes >= dblist->num_nodes) { + return EINVAL; + } + + db->pnn_list[db->num_nodes] = node; + db->num_nodes++; + + return 0; +} + +/* + * Create database on nodes where it is missing + */ + +struct db_create_missing_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + + struct node_list *nlist; + + const char *db_name; + uint32_t *missing_pnn_list; + int missing_num_nodes; +}; + +static void db_create_missing_done(struct tevent_req *subreq); + +static struct tevent_req *db_create_missing_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + struct node_list *nlist, + const char *db_name, + struct db *db) +{ + struct tevent_req *req, 
*subreq; + struct db_create_missing_state *state; + struct ctdb_req_control request; + unsigned int i, j; + + req = tevent_req_create(mem_ctx, + &state, + struct db_create_missing_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->nlist = nlist; + state->db_name = db_name; + + if (nlist->count == db->num_nodes) { + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + state->missing_pnn_list = talloc_array(mem_ctx, uint32_t, nlist->count); + if (tevent_req_nomem(state->missing_pnn_list, req)) { + return tevent_req_post(req, ev); + } + + for (i = 0; i < nlist->count; i++) { + uint32_t pnn = nlist->pnn_list[i] ; + + for (j = 0; j < db->num_nodes; j++) { + if (pnn == db->pnn_list[j]) { + break; + } + } + + if (j < db->num_nodes) { + continue; + } + + DBG_INFO("Create database %s on node %u\n", + state->db_name, + pnn); + state->missing_pnn_list[state->missing_num_nodes] = pnn; + state->missing_num_nodes++; + } + + if (db->db_flags & CTDB_DB_FLAGS_PERSISTENT) { + ctdb_req_control_db_attach_persistent(&request, db_name); + } else if (db->db_flags & CTDB_DB_FLAGS_REPLICATED) { + ctdb_req_control_db_attach_replicated(&request, db_name); + } else { + ctdb_req_control_db_attach(&request, db_name); + } + request.flags = CTDB_CTRL_FLAG_ATTACH_RECOVERY; + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->missing_pnn_list, + state->missing_num_nodes, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, db_create_missing_done, req); + + return req; +} + +static void db_create_missing_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct db_create_missing_state *state = tevent_req_data( + req, struct db_create_missing_state); + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, + &ret, 
+ NULL, + &err_list, + NULL); + TALLOC_FREE(subreq); + if (! status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error( + state->missing_pnn_list, + state->missing_num_nodes, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control DB_ATTACH failed for db %s" + " on node %u, ret=%d\n", + state->db_name, + pnn, + ret2); + node_list_ban_credits(state->nlist, pnn); + } else { + D_ERR("control DB_ATTACH failed for db %s, ret=%d\n", + state->db_name, + ret); + } + tevent_req_error(req, ret); + return; + } + + tevent_req_done(req); +} + +static bool db_create_missing_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + +/* + * Recovery database functions + */ + +struct recdb_context { + uint32_t db_id; + const char *db_name; + const char *db_path; + struct tdb_wrap *db; + bool persistent; +}; + +static struct recdb_context *recdb_create(TALLOC_CTX *mem_ctx, uint32_t db_id, + const char *db_name, + const char *db_path, + uint32_t hash_size, bool persistent) +{ + static char *db_dir_state = NULL; + struct recdb_context *recdb; + unsigned int tdb_flags; + + recdb = talloc(mem_ctx, struct recdb_context); + if (recdb == NULL) { + return NULL; + } + + if (db_dir_state == NULL) { + db_dir_state = getenv("CTDB_DBDIR_STATE"); + } + + recdb->db_name = db_name; + recdb->db_id = db_id; + recdb->db_path = talloc_asprintf(recdb, "%s/recdb.%s", + db_dir_state != NULL ? 
+ db_dir_state : + dirname(discard_const(db_path)), + db_name); + if (recdb->db_path == NULL) { + talloc_free(recdb); + return NULL; + } + unlink(recdb->db_path); + + tdb_flags = TDB_NOLOCK | TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING; + recdb->db = tdb_wrap_open(mem_ctx, recdb->db_path, hash_size, + tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600); + if (recdb->db == NULL) { + talloc_free(recdb); + D_ERR("failed to create recovery db %s\n", recdb->db_path); + return NULL; + } + + recdb->persistent = persistent; + + return recdb; +} + +static uint32_t recdb_id(struct recdb_context *recdb) +{ + return recdb->db_id; +} + +static const char *recdb_name(struct recdb_context *recdb) +{ + return recdb->db_name; +} + +static const char *recdb_path(struct recdb_context *recdb) +{ + return recdb->db_path; +} + +static struct tdb_context *recdb_tdb(struct recdb_context *recdb) +{ + return recdb->db->tdb; +} + +static bool recdb_persistent(struct recdb_context *recdb) +{ + return recdb->persistent; +} + +struct recdb_add_traverse_state { + struct recdb_context *recdb; + uint32_t mypnn; +}; + +static int recdb_add_traverse(uint32_t reqid, struct ctdb_ltdb_header *header, + TDB_DATA key, TDB_DATA data, + void *private_data) +{ + struct recdb_add_traverse_state *state = + (struct recdb_add_traverse_state *)private_data; + struct ctdb_ltdb_header *hdr; + TDB_DATA prev_data; + int ret; + + /* header is not marshalled separately in the pulldb control */ + if (data.dsize < sizeof(struct ctdb_ltdb_header)) { + return -1; + } + + hdr = (struct ctdb_ltdb_header *)data.dptr; + + /* fetch the existing record, if any */ + prev_data = tdb_fetch(recdb_tdb(state->recdb), key); + + if (prev_data.dptr != NULL) { + struct ctdb_ltdb_header prev_hdr; + + prev_hdr = *(struct ctdb_ltdb_header *)prev_data.dptr; + free(prev_data.dptr); + if (hdr->rsn < prev_hdr.rsn || + (hdr->rsn == prev_hdr.rsn && + prev_hdr.dmaster != state->mypnn)) { + return 0; + } + } + + ret = tdb_store(recdb_tdb(state->recdb), key, 
data, TDB_REPLACE); + if (ret != 0) { + return -1; + } + return 0; +} + +static bool recdb_add(struct recdb_context *recdb, int mypnn, + struct ctdb_rec_buffer *recbuf) +{ + struct recdb_add_traverse_state state; + int ret; + + state.recdb = recdb; + state.mypnn = mypnn; + + ret = ctdb_rec_buffer_traverse(recbuf, recdb_add_traverse, &state); + if (ret != 0) { + return false; + } + + return true; +} + +/* This function decides which records from recdb are retained */ +static int recbuf_filter_add(struct ctdb_rec_buffer *recbuf, bool persistent, + uint32_t reqid, uint32_t dmaster, + TDB_DATA key, TDB_DATA data) +{ + struct ctdb_ltdb_header *header; + int ret; + + /* Skip empty records */ + if (data.dsize <= sizeof(struct ctdb_ltdb_header)) { + return 0; + } + + /* update the dmaster field to point to us */ + header = (struct ctdb_ltdb_header *)data.dptr; + if (!persistent) { + header->dmaster = dmaster; + header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA; + } + + ret = ctdb_rec_buffer_add(recbuf, recbuf, reqid, NULL, key, data); + if (ret != 0) { + return ret; + } + + return 0; +} + +struct recdb_file_traverse_state { + struct ctdb_rec_buffer *recbuf; + struct recdb_context *recdb; + TALLOC_CTX *mem_ctx; + uint32_t dmaster; + uint32_t reqid; + bool persistent; + bool failed; + int fd; + size_t max_size; + unsigned int num_buffers; +}; + +static int recdb_file_traverse(struct tdb_context *tdb, + TDB_DATA key, TDB_DATA data, + void *private_data) +{ + struct recdb_file_traverse_state *state = + (struct recdb_file_traverse_state *)private_data; + int ret; + + ret = recbuf_filter_add(state->recbuf, state->persistent, + state->reqid, state->dmaster, key, data); + if (ret != 0) { + state->failed = true; + return ret; + } + + if (ctdb_rec_buffer_len(state->recbuf) > state->max_size) { + ret = ctdb_rec_buffer_write(state->recbuf, state->fd); + if (ret != 0) { + D_ERR("Failed to collect recovery records for %s\n", + recdb_name(state->recdb)); + state->failed = true; + return 
ret; + } + + state->num_buffers += 1; + + TALLOC_FREE(state->recbuf); + state->recbuf = ctdb_rec_buffer_init(state->mem_ctx, + recdb_id(state->recdb)); + if (state->recbuf == NULL) { + state->failed = true; + return ENOMEM; + } + } + + return 0; +} + +static int recdb_file(struct recdb_context *recdb, TALLOC_CTX *mem_ctx, + uint32_t dmaster, int fd, int max_size) +{ + struct recdb_file_traverse_state state; + int ret; + + state.recbuf = ctdb_rec_buffer_init(mem_ctx, recdb_id(recdb)); + if (state.recbuf == NULL) { + return -1; + } + state.recdb = recdb; + state.mem_ctx = mem_ctx; + state.dmaster = dmaster; + state.reqid = 0; + state.persistent = recdb_persistent(recdb); + state.failed = false; + state.fd = fd; + state.max_size = max_size; + state.num_buffers = 0; + + ret = tdb_traverse_read(recdb_tdb(recdb), recdb_file_traverse, &state); + if (ret == -1 || state.failed) { + TALLOC_FREE(state.recbuf); + return -1; + } + + ret = ctdb_rec_buffer_write(state.recbuf, fd); + if (ret != 0) { + D_ERR("Failed to collect recovery records for %s\n", + recdb_name(recdb)); + TALLOC_FREE(state.recbuf); + return -1; + } + state.num_buffers += 1; + + D_DEBUG("Wrote %d buffers of recovery records for %s\n", + state.num_buffers, recdb_name(recdb)); + + return state.num_buffers; +} + +/* + * Pull database from a single node + */ + +struct pull_database_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + struct recdb_context *recdb; + uint32_t pnn; + uint64_t srvid; + unsigned int num_records; + int result; +}; + +static void pull_database_handler(uint64_t srvid, TDB_DATA data, + void *private_data); +static void pull_database_register_done(struct tevent_req *subreq); +static void pull_database_unregister_done(struct tevent_req *subreq); +static void pull_database_done(struct tevent_req *subreq); + +static struct tevent_req *pull_database_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + uint32_t pnn, + struct 
recdb_context *recdb) +{ + struct tevent_req *req, *subreq; + struct pull_database_state *state; + + req = tevent_req_create(mem_ctx, &state, struct pull_database_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->recdb = recdb; + state->pnn = pnn; + state->srvid = srvid_next(); + + subreq = ctdb_client_set_message_handler_send( + state, state->ev, state->client, + state->srvid, pull_database_handler, + req); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + + tevent_req_set_callback(subreq, pull_database_register_done, req); + + return req; +} + +static void pull_database_handler(uint64_t srvid, TDB_DATA data, + void *private_data) +{ + struct tevent_req *req = talloc_get_type_abort( + private_data, struct tevent_req); + struct pull_database_state *state = tevent_req_data( + req, struct pull_database_state); + struct ctdb_rec_buffer *recbuf; + size_t np; + int ret; + bool status; + + if (srvid != state->srvid) { + return; + } + + ret = ctdb_rec_buffer_pull(data.dptr, data.dsize, state, &recbuf, &np); + if (ret != 0) { + D_ERR("Invalid data received for DB_PULL messages\n"); + return; + } + + if (recbuf->db_id != recdb_id(state->recdb)) { + talloc_free(recbuf); + D_ERR("Invalid dbid:%08x for DB_PULL messages for %s\n", + recbuf->db_id, recdb_name(state->recdb)); + return; + } + + status = recdb_add(state->recdb, ctdb_client_pnn(state->client), + recbuf); + if (! 
status) { + talloc_free(recbuf); + D_ERR("Failed to add records to recdb for %s\n", + recdb_name(state->recdb)); + return; + } + + state->num_records += recbuf->count; + talloc_free(recbuf); +} + +static void pull_database_register_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct pull_database_state *state = tevent_req_data( + req, struct pull_database_state); + struct ctdb_req_control request; + struct ctdb_pulldb_ext pulldb_ext; + int ret; + bool status; + + status = ctdb_client_set_message_handler_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! status) { + D_ERR("Failed to set message handler for DB_PULL for %s\n", + recdb_name(state->recdb)); + tevent_req_error(req, ret); + return; + } + + pulldb_ext.db_id = recdb_id(state->recdb); + pulldb_ext.lmaster = CTDB_LMASTER_ANY; + pulldb_ext.srvid = state->srvid; + + ctdb_req_control_db_pull(&request, &pulldb_ext); + subreq = ctdb_client_control_send(state, state->ev, state->client, + state->pnn, TIMEOUT(), &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, pull_database_done, req); +} + +static void pull_database_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct pull_database_state *state = tevent_req_data( + req, struct pull_database_state); + struct ctdb_reply_control *reply; + uint32_t num_records; + int ret; + bool status; + + status = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + if (! 
status) { + D_ERR("control DB_PULL failed for %s on node %u, ret=%d\n", + recdb_name(state->recdb), state->pnn, ret); + state->result = ret; + goto unregister; + } + + ret = ctdb_reply_control_db_pull(reply, &num_records); + talloc_free(reply); + if (num_records != state->num_records) { + D_ERR("mismatch (%u != %u) in DB_PULL records for db %s\n", + num_records, state->num_records, + recdb_name(state->recdb)); + state->result = EIO; + goto unregister; + } + + D_INFO("Pulled %d records for db %s from node %d\n", + state->num_records, recdb_name(state->recdb), state->pnn); + +unregister: + + subreq = ctdb_client_remove_message_handler_send( + state, state->ev, state->client, + state->srvid, req); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, pull_database_unregister_done, req); +} + +static void pull_database_unregister_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct pull_database_state *state = tevent_req_data( + req, struct pull_database_state); + int ret; + bool status; + + status = ctdb_client_remove_message_handler_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! 
status) { + D_ERR("failed to remove message handler for DB_PULL for db %s\n", + recdb_name(state->recdb)); + tevent_req_error(req, ret); + return; + } + + if (state->result != 0) { + tevent_req_error(req, state->result); + return; + } + + tevent_req_done(req); +} + +static bool pull_database_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + +/* + * Push database to specified nodes (new style) + */ + +struct push_database_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + struct recdb_context *recdb; + uint32_t *pnn_list; + unsigned int count; + uint64_t srvid; + uint32_t dmaster; + int fd; + int num_buffers; + int num_buffers_sent; + unsigned int num_records; +}; + +static void push_database_started(struct tevent_req *subreq); +static void push_database_send_msg(struct tevent_req *req); +static void push_database_send_done(struct tevent_req *subreq); +static void push_database_confirmed(struct tevent_req *subreq); + +static struct tevent_req *push_database_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + uint32_t *pnn_list, + unsigned int count, + struct recdb_context *recdb, + int max_size) +{ + struct tevent_req *req, *subreq; + struct push_database_state *state; + struct ctdb_req_control request; + struct ctdb_pulldb_ext pulldb_ext; + char *filename; + off_t offset; + + req = tevent_req_create(mem_ctx, &state, + struct push_database_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->recdb = recdb; + state->pnn_list = pnn_list; + state->count = count; + + state->srvid = srvid_next(); + state->dmaster = ctdb_client_pnn(client); + state->num_buffers_sent = 0; + state->num_records = 0; + + filename = talloc_asprintf(state, "%s.dat", recdb_path(recdb)); + if (tevent_req_nomem(filename, req)) { + return tevent_req_post(req, ev); + } + + state->fd = open(filename, O_RDWR|O_CREAT, 0644); + if (state->fd == -1) { + 
tevent_req_error(req, errno); + return tevent_req_post(req, ev); + } + unlink(filename); + talloc_free(filename); + + state->num_buffers = recdb_file(recdb, state, state->dmaster, + state->fd, max_size); + if (state->num_buffers == -1) { + tevent_req_error(req, ENOMEM); + return tevent_req_post(req, ev); + } + + offset = lseek(state->fd, 0, SEEK_SET); + if (offset != 0) { + tevent_req_error(req, EIO); + return tevent_req_post(req, ev); + } + + pulldb_ext.db_id = recdb_id(recdb); + pulldb_ext.srvid = state->srvid; + + ctdb_req_control_db_push_start(&request, &pulldb_ext); + subreq = ctdb_client_control_multi_send(state, ev, client, + pnn_list, count, + TIMEOUT(), &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, push_database_started, req); + + return req; +} + +static void push_database_started(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct push_database_state *state = tevent_req_data( + req, struct push_database_state); + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, + &err_list, NULL); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->pnn_list, + state->count, + err_list, &pnn); + if (ret2 != 0) { + D_ERR("control DB_PUSH_START failed for db %s" + " on node %u, ret=%d\n", + recdb_name(state->recdb), pnn, ret2); + } else { + D_ERR("control DB_PUSH_START failed for db %s," + " ret=%d\n", + recdb_name(state->recdb), ret); + } + talloc_free(err_list); + + tevent_req_error(req, ret); + return; + } + + push_database_send_msg(req); +} + +static void push_database_send_msg(struct tevent_req *req) +{ + struct push_database_state *state = tevent_req_data( + req, struct push_database_state); + struct tevent_req *subreq; + struct ctdb_rec_buffer *recbuf; + struct ctdb_req_message message; + TDB_DATA data; + size_t np; + int ret; + + if (state->num_buffers_sent == state->num_buffers) { + struct ctdb_req_control request; + + ctdb_req_control_db_push_confirm(&request, + recdb_id(state->recdb)); + subreq = ctdb_client_control_multi_send(state, state->ev, + state->client, + state->pnn_list, + state->count, + TIMEOUT(), &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, push_database_confirmed, req); + return; + } + + ret = ctdb_rec_buffer_read(state->fd, state, &recbuf); + if (ret != 0) { + tevent_req_error(req, ret); + return; + } + + data.dsize = ctdb_rec_buffer_len(recbuf); + data.dptr = talloc_size(state, data.dsize); + if (tevent_req_nomem(data.dptr, req)) { + return; + } + + ctdb_rec_buffer_push(recbuf, data.dptr, &np); + + message.srvid = state->srvid; + message.data.data = data; + + D_DEBUG("Pushing buffer %d with %d records for db %s\n", + state->num_buffers_sent, recbuf->count, + recdb_name(state->recdb)); + + subreq = ctdb_client_message_multi_send(state, state->ev, + state->client, + state->pnn_list, state->count, + &message); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, push_database_send_done, req); + + state->num_records += 
recbuf->count; + + talloc_free(data.dptr); + talloc_free(recbuf); +} + +static void push_database_send_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct push_database_state *state = tevent_req_data( + req, struct push_database_state); + bool status; + int ret; + + status = ctdb_client_message_multi_recv(subreq, &ret, NULL, NULL); + TALLOC_FREE(subreq); + if (! status) { + D_ERR("Sending recovery records failed for %s\n", + recdb_name(state->recdb)); + tevent_req_error(req, ret); + return; + } + + state->num_buffers_sent += 1; + + push_database_send_msg(req); +} + +static void push_database_confirmed(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct push_database_state *state = tevent_req_data( + req, struct push_database_state); + struct ctdb_reply_control **reply; + int *err_list; + bool status; + unsigned int i; + int ret; + uint32_t num_records; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, + &err_list, &reply); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->pnn_list, + state->count, err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control DB_PUSH_CONFIRM failed for db %s" + " on node %u, ret=%d\n", + recdb_name(state->recdb), pnn, ret2); + } else { + D_ERR("control DB_PUSH_CONFIRM failed for db %s," + " ret=%d\n", + recdb_name(state->recdb), ret); + } + tevent_req_error(req, ret); + return; + } + + for (i=0; i<state->count; i++) { + ret = ctdb_reply_control_db_push_confirm(reply[i], + &num_records); + if (ret != 0) { + tevent_req_error(req, EPROTO); + return; + } + + if (num_records != state->num_records) { + D_ERR("Node %u received %d of %d records for %s\n", + state->pnn_list[i], num_records, + state->num_records, recdb_name(state->recdb)); + tevent_req_error(req, EPROTO); + return; + } + } + + talloc_free(reply); + + D_INFO("Pushed %d records for db %s\n", + state->num_records, recdb_name(state->recdb)); + + tevent_req_done(req); +} + +static bool push_database_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + +/* + * Collect databases using highest sequence number + */ + +struct collect_highseqnum_db_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + struct node_list *nlist; + uint32_t db_id; + struct recdb_context *recdb; + + uint32_t max_pnn; +}; + +static void collect_highseqnum_db_seqnum_done(struct tevent_req *subreq); +static void collect_highseqnum_db_pulldb_done(struct tevent_req *subreq); + +static struct tevent_req *collect_highseqnum_db_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + struct node_list *nlist, + uint32_t db_id, + struct recdb_context *recdb) +{ + struct tevent_req *req, *subreq; + struct collect_highseqnum_db_state *state; + struct ctdb_req_control request; + + req = tevent_req_create(mem_ctx, &state, + struct collect_highseqnum_db_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; 
+ state->client = client; + state->nlist = nlist; + state->db_id = db_id; + state->recdb = recdb; + + ctdb_req_control_get_db_seqnum(&request, db_id); + subreq = ctdb_client_control_multi_send(mem_ctx, + ev, + client, + nlist->pnn_list, + nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, collect_highseqnum_db_seqnum_done, + req); + + return req; +} + +static void collect_highseqnum_db_seqnum_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct collect_highseqnum_db_state *state = tevent_req_data( + req, struct collect_highseqnum_db_state); + struct ctdb_reply_control **reply; + int *err_list; + bool status; + unsigned int i; + int ret; + uint64_t seqnum, max_seqnum; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, + &err_list, &reply); + TALLOC_FREE(subreq); + if (! status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control GET_DB_SEQNUM failed for db %s" + " on node %u, ret=%d\n", + recdb_name(state->recdb), pnn, ret2); + } else { + D_ERR("control GET_DB_SEQNUM failed for db %s," + " ret=%d\n", + recdb_name(state->recdb), ret); + } + tevent_req_error(req, ret); + return; + } + + max_seqnum = 0; + state->max_pnn = state->nlist->pnn_list[0]; + for (i=0; i<state->nlist->count; i++) { + ret = ctdb_reply_control_get_db_seqnum(reply[i], &seqnum); + if (ret != 0) { + tevent_req_error(req, EPROTO); + return; + } + + if (max_seqnum < seqnum) { + max_seqnum = seqnum; + state->max_pnn = state->nlist->pnn_list[i]; + } + } + + talloc_free(reply); + + D_INFO("Pull persistent db %s from node %d with seqnum 0x%"PRIx64"\n", + recdb_name(state->recdb), state->max_pnn, max_seqnum); + + subreq = pull_database_send(state, + state->ev, + state->client, + state->max_pnn, + 
state->recdb); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, collect_highseqnum_db_pulldb_done, + req); +} + +static void collect_highseqnum_db_pulldb_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct collect_highseqnum_db_state *state = tevent_req_data( + req, struct collect_highseqnum_db_state); + int ret; + bool status; + + status = pull_database_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! status) { + node_list_ban_credits(state->nlist, state->max_pnn); + tevent_req_error(req, ret); + return; + } + + tevent_req_done(req); +} + +static bool collect_highseqnum_db_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + +/* + * Collect all databases + */ + +struct collect_all_db_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + struct node_list *nlist; + uint32_t db_id; + struct recdb_context *recdb; + + struct ctdb_pulldb pulldb; + unsigned int index; +}; + +static void collect_all_db_pulldb_done(struct tevent_req *subreq); + +static struct tevent_req *collect_all_db_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + struct node_list *nlist, + uint32_t db_id, + struct recdb_context *recdb) +{ + struct tevent_req *req, *subreq; + struct collect_all_db_state *state; + + req = tevent_req_create(mem_ctx, &state, + struct collect_all_db_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->nlist = nlist; + state->db_id = db_id; + state->recdb = recdb; + state->index = 0; + + subreq = pull_database_send(state, + ev, + client, + nlist->pnn_list[state->index], + recdb); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, collect_all_db_pulldb_done, req); + + return req; +} + +static void collect_all_db_pulldb_done(struct tevent_req *subreq) +{ + 
struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct collect_all_db_state *state = tevent_req_data( + req, struct collect_all_db_state); + int ret; + bool status; + + status = pull_database_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! status) { + node_list_ban_credits(state->nlist, + state->nlist->pnn_list[state->index]); + tevent_req_error(req, ret); + return; + } + + state->index += 1; + if (state->index == state->nlist->count) { + tevent_req_done(req); + return; + } + + subreq = pull_database_send(state, + state->ev, + state->client, + state->nlist->pnn_list[state->index], + state->recdb); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, collect_all_db_pulldb_done, req); +} + +static bool collect_all_db_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + + +/** + * For each database do the following: + * - Get DB name from all nodes + * - Attach database on missing nodes + * - Get DB path + * - Freeze database on all nodes + * - Start transaction on all nodes + * - Collect database from all nodes + * - Wipe database on all nodes + * - Push database to all nodes + * - Commit transaction on all nodes + * - Thaw database on all nodes + */ + +struct recover_db_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + struct ctdb_tunable_list *tun_list; + struct node_list *nlist; + struct db *db; + + uint32_t destnode; + struct ctdb_transdb transdb; + + const char *db_name, *db_path; + struct recdb_context *recdb; +}; + +static void recover_db_name_done(struct tevent_req *subreq); +static void recover_db_create_missing_done(struct tevent_req *subreq); +static void recover_db_path_done(struct tevent_req *subreq); +static void recover_db_freeze_done(struct tevent_req *subreq); +static void recover_db_transaction_started(struct tevent_req *subreq); +static void recover_db_collect_done(struct tevent_req *subreq); +static void 
recover_db_wipedb_done(struct tevent_req *subreq); +static void recover_db_pushdb_done(struct tevent_req *subreq); +static void recover_db_transaction_committed(struct tevent_req *subreq); +static void recover_db_thaw_done(struct tevent_req *subreq); + +static struct tevent_req *recover_db_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + struct ctdb_tunable_list *tun_list, + struct node_list *nlist, + uint32_t generation, + struct db *db) +{ + struct tevent_req *req, *subreq; + struct recover_db_state *state; + struct ctdb_req_control request; + + req = tevent_req_create(mem_ctx, &state, struct recover_db_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->tun_list = tun_list; + state->nlist = nlist; + state->db = db; + + state->destnode = ctdb_client_pnn(client); + state->transdb.db_id = db->db_id; + state->transdb.tid = generation; + + ctdb_req_control_get_dbname(&request, db->db_id); + subreq = ctdb_client_control_multi_send(state, + ev, + client, + state->db->pnn_list, + state->db->num_nodes, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, recover_db_name_done, req); + + return req; +} + +static void recover_db_name_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + struct ctdb_reply_control **reply; + int *err_list; + unsigned int i; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, + &ret, + state, + &err_list, + &reply); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->db->pnn_list, + state->db->num_nodes, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control GET_DBNAME failed on node %u," + " ret=%d\n", + pnn, + ret2); + } else { + D_ERR("control GET_DBNAME failed, ret=%d\n", + ret); + } + tevent_req_error(req, ret); + return; + } + + for (i = 0; i < state->db->num_nodes; i++) { + const char *db_name; + uint32_t pnn; + + pnn = state->nlist->pnn_list[i]; + + ret = ctdb_reply_control_get_dbname(reply[i], + state, + &db_name); + if (ret != 0) { + D_ERR("control GET_DBNAME failed on node %u " + "for db=0x%x, ret=%d\n", + pnn, + state->db->db_id, + ret); + tevent_req_error(req, EPROTO); + return; + } + + if (state->db_name == NULL) { + state->db_name = db_name; + continue; + } + + if (strcmp(state->db_name, db_name) != 0) { + D_ERR("Incompatible database name for 0x%"PRIx32" " + "(%s != %s) on node %"PRIu32"\n", + state->db->db_id, + db_name, + state->db_name, + pnn); + node_list_ban_credits(state->nlist, pnn); + tevent_req_error(req, ret); + return; + } + } + + talloc_free(reply); + + subreq = db_create_missing_send(state, + state->ev, + state->client, + state->nlist, + state->db_name, + state->db); + + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_create_missing_done, req); +} + +static void recover_db_create_missing_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + struct ctdb_req_control request; + int ret; + bool status; + + /* Could sanity check the db_id here */ + status = db_create_missing_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! 
status) { + tevent_req_error(req, ret); + return; + } + + ctdb_req_control_getdbpath(&request, state->db->db_id); + subreq = ctdb_client_control_send(state, state->ev, state->client, + state->destnode, TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_path_done, req); +} + +static void recover_db_path_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + struct ctdb_reply_control *reply; + struct ctdb_req_control request; + int ret; + bool status; + + status = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + if (! status) { + D_ERR("control GETDBPATH failed for db %s, ret=%d\n", + state->db_name, ret); + tevent_req_error(req, ret); + return; + } + + ret = ctdb_reply_control_getdbpath(reply, state, &state->db_path); + if (ret != 0) { + D_ERR("control GETDBPATH failed for db %s, ret=%d\n", + state->db_name, ret); + tevent_req_error(req, EPROTO); + return; + } + + talloc_free(reply); + + ctdb_req_control_db_freeze(&request, state->db->db_id); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_freeze_done, req); +} + +static void recover_db_freeze_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + struct ctdb_req_control request; + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control FREEZE_DB failed for db %s" + " on node %u, ret=%d\n", + state->db_name, pnn, ret2); + + node_list_ban_credits(state->nlist, pnn); + } else { + D_ERR("control FREEZE_DB failed for db %s, ret=%d\n", + state->db_name, ret); + } + tevent_req_error(req, ret); + return; + } + + ctdb_req_control_db_transaction_start(&request, &state->transdb); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_transaction_started, req); +} + +static void recover_db_transaction_started(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + int *err_list; + uint32_t flags; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control TRANSACTION_DB failed for db=%s" + " on node %u, ret=%d\n", + state->db_name, pnn, ret2); + } else { + D_ERR("control TRANSACTION_DB failed for db=%s," + " ret=%d\n", state->db_name, ret); + } + tevent_req_error(req, ret); + return; + } + + flags = state->db->db_flags; + state->recdb = recdb_create(state, + state->db->db_id, + state->db_name, + state->db_path, + state->tun_list->database_hash_size, + flags & CTDB_DB_FLAGS_PERSISTENT); + if (tevent_req_nomem(state->recdb, req)) { + return; + } + + if ((flags & CTDB_DB_FLAGS_PERSISTENT) || + (flags & CTDB_DB_FLAGS_REPLICATED)) { + subreq = collect_highseqnum_db_send(state, + state->ev, + state->client, + state->nlist, + state->db->db_id, + state->recdb); + } else { + subreq = collect_all_db_send(state, + state->ev, + state->client, + state->nlist, + state->db->db_id, + state->recdb); + } + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_collect_done, req); +} + +static void recover_db_collect_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + struct ctdb_req_control request; + int ret; + bool status; + + if ((state->db->db_flags & CTDB_DB_FLAGS_PERSISTENT) || + (state->db->db_flags & CTDB_DB_FLAGS_REPLICATED)) { + status = collect_highseqnum_db_recv(subreq, &ret); + } else { + status = collect_all_db_recv(subreq, &ret); + } + TALLOC_FREE(subreq); + if (! 
status) { + tevent_req_error(req, ret); + return; + } + + ctdb_req_control_wipe_database(&request, &state->transdb); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_wipedb_done, req); +} + +static void recover_db_wipedb_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control WIPEDB failed for db %s on node %u," + " ret=%d\n", state->db_name, pnn, ret2); + } else { + D_ERR("control WIPEDB failed for db %s, ret=%d\n", + state->db_name, ret); + } + tevent_req_error(req, ret); + return; + } + + subreq = push_database_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + state->recdb, + state->tun_list->rec_buffer_size_limit); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_pushdb_done, req); +} + +static void recover_db_pushdb_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + struct ctdb_req_control request; + int ret; + bool status; + + status = push_database_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! 
status) { + tevent_req_error(req, ret); + return; + } + + TALLOC_FREE(state->recdb); + + ctdb_req_control_db_transaction_commit(&request, &state->transdb); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_transaction_committed, req); +} + +static void recover_db_transaction_committed(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + struct ctdb_req_control request; + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control DB_TRANSACTION_COMMIT failed for db %s" + " on node %u, ret=%d\n", + state->db_name, pnn, ret2); + } else { + D_ERR("control DB_TRANSACTION_COMMIT failed for db %s," + " ret=%d\n", state->db_name, ret); + } + tevent_req_error(req, ret); + return; + } + + ctdb_req_control_db_thaw(&request, state->db->db_id); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recover_db_thaw_done, req); +} + +static void recover_db_thaw_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recover_db_state *state = tevent_req_data( + req, struct recover_db_state); + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + 
NULL); + TALLOC_FREE(subreq); + if (! status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control DB_THAW failed for db %s on node %u," + " ret=%d\n", state->db_name, pnn, ret2); + } else { + D_ERR("control DB_THAW failed for db %s, ret=%d\n", + state->db_name, ret); + } + tevent_req_error(req, ret); + return; + } + + tevent_req_done(req); +} + +static bool recover_db_recv(struct tevent_req *req) +{ + return generic_recv(req, NULL); +} + + +/* + * Start database recovery for each database + * + * Try to recover each database 5 times before failing recovery. + */ + +struct db_recovery_state { + struct tevent_context *ev; + struct db_list *dblist; + unsigned int num_replies; + unsigned int num_failed; +}; + +struct db_recovery_one_state { + struct tevent_req *req; + struct ctdb_client_context *client; + struct db_list *dblist; + struct ctdb_tunable_list *tun_list; + struct node_list *nlist; + uint32_t generation; + struct db *db; + int num_fails; +}; + +static void db_recovery_one_done(struct tevent_req *subreq); + +static struct tevent_req *db_recovery_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + struct db_list *dblist, + struct ctdb_tunable_list *tun_list, + struct node_list *nlist, + uint32_t generation) +{ + struct tevent_req *req, *subreq; + struct db_recovery_state *state; + struct db *db; + + req = tevent_req_create(mem_ctx, &state, struct db_recovery_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->dblist = dblist; + state->num_replies = 0; + state->num_failed = 0; + + if (dblist->num_dbs == 0) { + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + for (db = dblist->db; db != NULL; db = db->next) { + struct db_recovery_one_state *substate; + + substate = talloc_zero(state, struct db_recovery_one_state); + if (tevent_req_nomem(substate, req)) { + 
return tevent_req_post(req, ev); + } + + substate->req = req; + substate->client = client; + substate->dblist = dblist; + substate->tun_list = tun_list; + substate->nlist = nlist; + substate->generation = generation; + substate->db = db; + + subreq = recover_db_send(state, + ev, + client, + tun_list, + nlist, + generation, + substate->db); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, db_recovery_one_done, + substate); + D_NOTICE("recover database 0x%08x\n", substate->db->db_id); + } + + return req; +} + +static void db_recovery_one_done(struct tevent_req *subreq) +{ + struct db_recovery_one_state *substate = tevent_req_callback_data( + subreq, struct db_recovery_one_state); + struct tevent_req *req = substate->req; + struct db_recovery_state *state = tevent_req_data( + req, struct db_recovery_state); + bool status; + + status = recover_db_recv(subreq); + TALLOC_FREE(subreq); + + if (status) { + talloc_free(substate); + goto done; + } + + substate->num_fails += 1; + if (substate->num_fails < NUM_RETRIES) { + subreq = recover_db_send(state, + state->ev, + substate->client, + substate->tun_list, + substate->nlist, + substate->generation, + substate->db); + if (tevent_req_nomem(subreq, req)) { + goto failed; + } + tevent_req_set_callback(subreq, db_recovery_one_done, substate); + D_NOTICE("recover database 0x%08x, attempt %d\n", + substate->db->db_id, substate->num_fails+1); + return; + } + +failed: + state->num_failed += 1; + +done: + state->num_replies += 1; + + if (state->num_replies == state->dblist->num_dbs) { + tevent_req_done(req); + } +} + +static bool db_recovery_recv(struct tevent_req *req, unsigned int *count) +{ + struct db_recovery_state *state = tevent_req_data( + req, struct db_recovery_state); + int err; + + if (tevent_req_is_unix_error(req, &err)) { + *count = 0; + return false; + } + + *count = state->num_replies - state->num_failed; + + if (state->num_failed > 0) { + return false; + } 
+ + return true; +} + +struct ban_node_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + struct ctdb_tunable_list *tun_list; + struct node_list *nlist; + uint32_t destnode; + + uint32_t max_pnn; +}; + +static bool ban_node_check(struct tevent_req *req); +static void ban_node_check_done(struct tevent_req *subreq); +static void ban_node_done(struct tevent_req *subreq); + +static struct tevent_req *ban_node_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + struct ctdb_tunable_list *tun_list, + struct node_list *nlist) +{ + struct tevent_req *req; + struct ban_node_state *state; + bool ok; + + req = tevent_req_create(mem_ctx, &state, struct ban_node_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->tun_list = tun_list; + state->nlist = nlist; + state->destnode = ctdb_client_pnn(client); + + /* Bans are not enabled */ + if (state->tun_list->enable_bans == 0) { + D_ERR("Bans are not enabled\n"); + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + ok = ban_node_check(req); + if (!ok) { + return tevent_req_post(req, ev); + } + + return req; +} + +static bool ban_node_check(struct tevent_req *req) +{ + struct tevent_req *subreq; + struct ban_node_state *state = tevent_req_data( + req, struct ban_node_state); + struct ctdb_req_control request; + unsigned max_credits = 0, i; + + for (i=0; i<state->nlist->count; i++) { + if (state->nlist->ban_credits[i] > max_credits) { + state->max_pnn = state->nlist->pnn_list[i]; + max_credits = state->nlist->ban_credits[i]; + } + } + + if (max_credits < NUM_RETRIES) { + tevent_req_done(req); + return false; + } + + ctdb_req_control_get_nodemap(&request); + subreq = ctdb_client_control_send(state, + state->ev, + state->client, + state->max_pnn, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return false; + } + tevent_req_set_callback(subreq, ban_node_check_done, req); + + return true; 
+} + +static void ban_node_check_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct ban_node_state *state = tevent_req_data( + req, struct ban_node_state); + struct ctdb_reply_control *reply; + struct ctdb_node_map *nodemap; + struct ctdb_req_control request; + struct ctdb_ban_state ban; + unsigned int i; + int ret; + bool ok; + + ok = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + if (!ok) { + D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n", + state->max_pnn, ret); + tevent_req_error(req, ret); + return; + } + + ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap); + if (ret != 0) { + D_ERR("control GET_NODEMAP failed, ret=%d\n", ret); + tevent_req_error(req, ret); + return; + } + + for (i=0; i<nodemap->num; i++) { + if (nodemap->node[i].pnn != state->max_pnn) { + continue; + } + + /* If the node became inactive, reset ban_credits */ + if (nodemap->node[i].flags & NODE_FLAGS_INACTIVE) { + unsigned int j; + + for (j=0; j<state->nlist->count; j++) { + if (state->nlist->pnn_list[j] == + state->max_pnn) { + state->nlist->ban_credits[j] = 0; + break; + } + } + state->max_pnn = CTDB_UNKNOWN_PNN; + } + } + + talloc_free(nodemap); + talloc_free(reply); + + /* If node becomes inactive during recovery, pick next */ + if (state->max_pnn == CTDB_UNKNOWN_PNN) { + (void) ban_node_check(req); + return; + } + + ban = (struct ctdb_ban_state) { + .pnn = state->max_pnn, + .time = state->tun_list->recovery_ban_period, + }; + + D_ERR("Banning node %u for %u seconds\n", ban.pnn, ban.time); + + ctdb_req_control_set_ban_state(&request, &ban); + subreq = ctdb_client_control_send(state, + state->ev, + state->client, + ban.pnn, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, ban_node_done, req); +} + +static void ban_node_done(struct tevent_req *subreq) +{ + struct tevent_req *req = 
tevent_req_callback_data( + subreq, struct tevent_req); + struct ban_node_state *state = tevent_req_data( + req, struct ban_node_state); + struct ctdb_reply_control *reply; + int ret; + bool status; + + status = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + if (! status) { + tevent_req_error(req, ret); + return; + } + + ret = ctdb_reply_control_set_ban_state(reply); + if (ret != 0) { + D_ERR("control SET_BAN_STATE failed, ret=%d\n", ret); + tevent_req_error(req, ret); + return; + } + + talloc_free(reply); + tevent_req_done(req); +} + +static bool ban_node_recv(struct tevent_req *req, int *perr) +{ + if (tevent_req_is_unix_error(req, perr)) { + return false; + } + + return true; +} + +/* + * Run the parallel database recovery + * + * - Get tunables + * - Get nodemap from all nodes + * - Get capabilities from all nodes + * - Get dbmap + * - Set RECOVERY_ACTIVE + * - Send START_RECOVERY + * - Update vnnmap on all nodes + * - Run database recovery + * - Set RECOVERY_NORMAL + * - Send END_RECOVERY + */ + +struct recovery_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + uint32_t generation; + uint32_t destnode; + struct node_list *nlist; + struct ctdb_tunable_list *tun_list; + struct ctdb_vnn_map *vnnmap; + struct db_list *dblist; +}; + +static void recovery_tunables_done(struct tevent_req *subreq); +static void recovery_nodemap_done(struct tevent_req *subreq); +static void recovery_nodemap_verify(struct tevent_req *subreq); +static void recovery_capabilities_done(struct tevent_req *subreq); +static void recovery_dbmap_done(struct tevent_req *subreq); +static void recovery_active_done(struct tevent_req *subreq); +static void recovery_start_recovery_done(struct tevent_req *subreq); +static void recovery_vnnmap_update_done(struct tevent_req *subreq); +static void recovery_db_recovery_done(struct tevent_req *subreq); +static void recovery_failed_done(struct tevent_req *subreq); +static void 
recovery_normal_done(struct tevent_req *subreq); +static void recovery_end_recovery_done(struct tevent_req *subreq); + +static struct tevent_req *recovery_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + uint32_t generation) +{ + struct tevent_req *req, *subreq; + struct recovery_state *state; + struct ctdb_req_control request; + + req = tevent_req_create(mem_ctx, &state, struct recovery_state); + if (req == NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->generation = generation; + state->destnode = ctdb_client_pnn(client); + + ctdb_req_control_get_all_tunables(&request); + subreq = ctdb_client_control_send(state, state->ev, state->client, + state->destnode, TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, recovery_tunables_done, req); + + return req; +} + +static void recovery_tunables_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_reply_control *reply; + struct ctdb_req_control request; + int ret; + bool status; + + status = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + if (! 
status) { + D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret); + tevent_req_error(req, ret); + return; + } + + ret = ctdb_reply_control_get_all_tunables(reply, state, + &state->tun_list); + if (ret != 0) { + D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret); + tevent_req_error(req, EPROTO); + return; + } + + talloc_free(reply); + + recover_timeout = state->tun_list->recover_timeout; + + ctdb_req_control_get_nodemap(&request); + subreq = ctdb_client_control_send(state, state->ev, state->client, + state->destnode, TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_nodemap_done, req); +} + +static void recovery_nodemap_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_reply_control *reply; + struct ctdb_req_control request; + struct ctdb_node_map *nodemap; + unsigned int i; + bool status; + int ret; + + status = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + if (! 
status) { + D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n", + state->destnode, ret); + tevent_req_error(req, ret); + return; + } + + ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap); + if (ret != 0) { + D_ERR("control GET_NODEMAP failed, ret=%d\n", ret); + tevent_req_error(req, ret); + return; + } + + state->nlist = node_list_init(state, nodemap->num); + if (tevent_req_nomem(state->nlist, req)) { + return; + } + + for (i=0; i<nodemap->num; i++) { + bool ok; + + if (nodemap->node[i].flags & NODE_FLAGS_DISCONNECTED) { + continue; + } + + ok = node_list_add(state->nlist, nodemap->node[i].pnn); + if (!ok) { + tevent_req_error(req, EINVAL); + return; + } + } + + talloc_free(nodemap); + talloc_free(reply); + + /* Verify flags by getting local node information from each node */ + ctdb_req_control_get_nodemap(&request); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_nodemap_verify, req); +} + +static void recovery_nodemap_verify(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_req_control request; + struct ctdb_reply_control **reply; + struct node_list *nlist; + unsigned int i; + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, + &ret, + state, + &err_list, + &reply); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control GET_NODEMAP failed on node %u," + " ret=%d\n", pnn, ret2); + } else { + D_ERR("control GET_NODEMAP failed, ret=%d\n", ret); + } + tevent_req_error(req, ret); + return; + } + + nlist = node_list_init(state, state->nlist->size); + if (tevent_req_nomem(nlist, req)) { + return; + } + + for (i=0; i<state->nlist->count; i++) { + struct ctdb_node_map *nodemap = NULL; + uint32_t pnn, flags; + unsigned int j; + bool ok; + + pnn = state->nlist->pnn_list[i]; + ret = ctdb_reply_control_get_nodemap(reply[i], + state, + &nodemap); + if (ret != 0) { + D_ERR("control GET_NODEMAP failed on node %u\n", pnn); + tevent_req_error(req, EPROTO); + return; + } + + flags = NODE_FLAGS_DISCONNECTED; + for (j=0; j<nodemap->num; j++) { + if (nodemap->node[j].pnn == pnn) { + flags = nodemap->node[j].flags; + break; + } + } + + TALLOC_FREE(nodemap); + + if (flags & NODE_FLAGS_INACTIVE) { + continue; + } + + ok = node_list_add(nlist, pnn); + if (!ok) { + tevent_req_error(req, EINVAL); + return; + } + } + + talloc_free(reply); + + talloc_free(state->nlist); + state->nlist = nlist; + + ctdb_req_control_get_capabilities(&request); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_capabilities_done, req); +} + +static void recovery_capabilities_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_reply_control **reply; + struct ctdb_req_control request; + int *err_list; + unsigned int i; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, 
&ret, state, &err_list, + &reply); + TALLOC_FREE(subreq); + if (! status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control GET_CAPABILITIES failed on node %u," + " ret=%d\n", pnn, ret2); + } else { + D_ERR("control GET_CAPABILITIES failed, ret=%d\n", + ret); + } + tevent_req_error(req, ret); + return; + } + + for (i=0; i<state->nlist->count; i++) { + uint32_t caps; + + ret = ctdb_reply_control_get_capabilities(reply[i], &caps); + if (ret != 0) { + D_ERR("control GET_CAPABILITIES failed on node %u\n", + state->nlist->pnn_list[i]); + tevent_req_error(req, EPROTO); + return; + } + + state->nlist->caps[i] = caps; + } + + talloc_free(reply); + + ctdb_req_control_get_dbmap(&request); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_dbmap_done, req); +} + +static void recovery_dbmap_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_reply_control **reply; + struct ctdb_req_control request; + int *err_list; + unsigned int i, j; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, + &ret, + state, + &err_list, + &reply); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("control GET_DBMAP failed on node %u," + " ret=%d\n", pnn, ret2); + } else { + D_ERR("control GET_DBMAP failed, ret=%d\n", + ret); + } + tevent_req_error(req, ret); + return; + } + + state->dblist = db_list_init(state, state->nlist->count); + if (tevent_req_nomem(state->dblist, req)) { + D_ERR("memory allocation error\n"); + return; + } + + for (i = 0; i < state->nlist->count; i++) { + struct ctdb_dbid_map *dbmap = NULL; + uint32_t pnn; + + pnn = state->nlist->pnn_list[i]; + + ret = ctdb_reply_control_get_dbmap(reply[i], state, &dbmap); + if (ret != 0) { + D_ERR("control GET_DBMAP failed on node %u\n", + pnn); + tevent_req_error(req, EPROTO); + return; + } + + for (j = 0; j < dbmap->num; j++) { + ret = db_list_check_and_add(state->dblist, + dbmap->dbs[j].db_id, + dbmap->dbs[j].flags, + pnn); + if (ret != 0) { + D_ERR("failed to add database list entry, " + "ret=%d\n", + ret); + tevent_req_error(req, ret); + return; + } + } + + TALLOC_FREE(dbmap); + } + + ctdb_req_control_set_recmode(&request, CTDB_RECOVERY_ACTIVE); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_active_done, req); +} + +static void recovery_active_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_req_control request; + struct ctdb_vnn_map *vnnmap; + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("failed to set recovery mode ACTIVE on node %u," + " ret=%d\n", pnn, ret2); + } else { + D_ERR("failed to set recovery mode ACTIVE, ret=%d\n", + ret); + } + tevent_req_error(req, ret); + return; + } + + D_ERR("Set recovery mode to ACTIVE\n"); + + /* Calculate new VNNMAP */ + vnnmap = talloc_zero(state, struct ctdb_vnn_map); + if (tevent_req_nomem(vnnmap, req)) { + return; + } + + vnnmap->map = node_list_lmaster(state->nlist, vnnmap, &vnnmap->size); + if (tevent_req_nomem(vnnmap->map, req)) { + return; + } + + if (vnnmap->size == 0) { + D_WARNING("No active lmasters found. Adding recmaster anyway\n"); + vnnmap->map[0] = state->destnode; + vnnmap->size = 1; + } + + vnnmap->generation = state->generation; + + state->vnnmap = vnnmap; + + ctdb_req_control_start_recovery(&request); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_start_recovery_done, req); +} + +static void recovery_start_recovery_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_req_control request; + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("failed to run start_recovery event on node %u," + " ret=%d\n", pnn, ret2); + } else { + D_ERR("failed to run start_recovery event, ret=%d\n", + ret); + } + tevent_req_error(req, ret); + return; + } + + D_ERR("start_recovery event finished\n"); + + ctdb_req_control_setvnnmap(&request, state->vnnmap); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_vnnmap_update_done, req); +} + +static void recovery_vnnmap_update_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, NULL, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("failed to update VNNMAP on node %u, ret=%d\n", + pnn, ret2); + } else { + D_ERR("failed to update VNNMAP, ret=%d\n", ret); + } + tevent_req_error(req, ret); + return; + } + + D_NOTICE("updated VNNMAP\n"); + + subreq = db_recovery_send(state, + state->ev, + state->client, + state->dblist, + state->tun_list, + state->nlist, + state->vnnmap->generation); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_db_recovery_done, req); +} + +static void recovery_db_recovery_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_req_control request; + bool status; + unsigned int count; + + status = db_recovery_recv(subreq, &count); + TALLOC_FREE(subreq); + + D_ERR("%d of %d databases recovered\n", count, state->dblist->num_dbs); + + if (! status) { + subreq = ban_node_send(state, + state->ev, + state->client, + state->tun_list, + state->nlist); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_failed_done, req); + return; + } + + ctdb_req_control_set_recmode(&request, CTDB_RECOVERY_NORMAL); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_normal_done, req); +} + +static void recovery_failed_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + int ret; + bool status; + + status = ban_node_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! 
status) { + D_ERR("failed to ban node, ret=%d\n", ret); + } + + tevent_req_error(req, EIO); +} + +static void recovery_normal_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + struct ctdb_req_control request; + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("failed to set recovery mode NORMAL on node %u," + " ret=%d\n", pnn, ret2); + } else { + D_ERR("failed to set recovery mode NORMAL, ret=%d\n", + ret); + } + tevent_req_error(req, ret); + return; + } + + D_ERR("Set recovery mode to NORMAL\n"); + + ctdb_req_control_end_recovery(&request); + subreq = ctdb_client_control_multi_send(state, + state->ev, + state->client, + state->nlist->pnn_list, + state->nlist->count, + TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, recovery_end_recovery_done, req); +} + +static void recovery_end_recovery_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct recovery_state *state = tevent_req_data( + req, struct recovery_state); + int *err_list; + int ret; + bool status; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list, + NULL); + TALLOC_FREE(subreq); + if (! 
status) { + int ret2; + uint32_t pnn; + + ret2 = ctdb_client_control_multi_error(state->nlist->pnn_list, + state->nlist->count, + err_list, + &pnn); + if (ret2 != 0) { + D_ERR("failed to run recovered event on node %u," + " ret=%d\n", pnn, ret2); + } else { + D_ERR("failed to run recovered event, ret=%d\n", ret); + } + tevent_req_error(req, ret); + return; + } + + D_ERR("recovered event finished\n"); + + tevent_req_done(req); +} + +static void recovery_recv(struct tevent_req *req, int *perr) +{ + generic_recv(req, perr); +} + +static void usage(const char *progname) +{ + fprintf(stderr, "\nUsage: %s <output-fd> <ctdb-socket-path> <generation>\n", + progname); +} + + +/* + * Arguments - log fd, write fd, socket path, generation + */ +int main(int argc, char *argv[]) +{ + int write_fd; + const char *sockpath; + TALLOC_CTX *mem_ctx = NULL; + struct tevent_context *ev; + struct ctdb_client_context *client; + bool status; + int ret = 0; + struct tevent_req *req; + uint32_t generation; + + if (argc != 4) { + usage(argv[0]); + exit(1); + } + + write_fd = atoi(argv[1]); + sockpath = argv[2]; + generation = (uint32_t)smb_strtoul(argv[3], + NULL, + 0, + &ret, + SMB_STR_STANDARD); + if (ret != 0) { + fprintf(stderr, "recovery: unable to initialize generation\n"); + goto failed; + } + + mem_ctx = talloc_new(NULL); + if (mem_ctx == NULL) { + fprintf(stderr, "recovery: talloc_new() failed\n"); + goto failed; + } + + ret = logging_init(mem_ctx, NULL, NULL, "ctdb-recovery"); + if (ret != 0) { + fprintf(stderr, "recovery: Unable to initialize logging\n"); + goto failed; + } + + ev = tevent_context_init(mem_ctx); + if (ev == NULL) { + D_ERR("tevent_context_init() failed\n"); + goto failed; + } + + status = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL); + if (!status) { + D_ERR("logging_setup_sighup_handler() failed\n"); + goto failed; + } + + ret = ctdb_client_init(mem_ctx, ev, sockpath, &client); + if (ret != 0) { + D_ERR("ctdb_client_init() failed, ret=%d\n", ret); + goto 
failed; + } + + req = recovery_send(mem_ctx, ev, client, generation); + if (req == NULL) { + D_ERR("database_recover_send() failed\n"); + goto failed; + } + + if (! tevent_req_poll(req, ev)) { + D_ERR("tevent_req_poll() failed\n"); + goto failed; + } + + recovery_recv(req, &ret); + TALLOC_FREE(req); + if (ret != 0) { + D_ERR("database recovery failed, ret=%d\n", ret); + goto failed; + } + + sys_write(write_fd, &ret, sizeof(ret)); + return 0; + +failed: + TALLOC_FREE(mem_ctx); + return 1; +} diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c new file mode 100644 index 0000000..b602cee --- /dev/null +++ b/ctdb/server/ctdb_server.c @@ -0,0 +1,608 @@ +/* + ctdb main protocol code + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/common.h" +#include "common/logging.h" + +/* + choose the transport we will use +*/ +int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport) +{ + ctdb->transport = talloc_strdup(ctdb, transport); + CTDB_NO_MEMORY(ctdb, ctdb->transport); + + return 0; +} + +/* Return the node structure for nodeip, NULL if nodeip is invalid */ +struct ctdb_node *ctdb_ip_to_node(struct ctdb_context *ctdb, + const ctdb_sock_addr *nodeip) +{ + unsigned int nodeid; + + for (nodeid=0;nodeid<ctdb->num_nodes;nodeid++) { + if (ctdb->nodes[nodeid]->flags & NODE_FLAGS_DELETED) { + continue; + } + if (ctdb_same_ip(&ctdb->nodes[nodeid]->address, nodeip)) { + return ctdb->nodes[nodeid]; + } + } + + return NULL; +} + +/* Return the PNN for nodeip, CTDB_UNKNOWN_PNN if nodeip is invalid */ +uint32_t ctdb_ip_to_pnn(struct ctdb_context *ctdb, + const ctdb_sock_addr *nodeip) +{ + struct ctdb_node *node; + + node = ctdb_ip_to_node(ctdb, nodeip); + if (node == NULL) { + return CTDB_UNKNOWN_PNN; + } + + return node->pnn; +} + +/* Load a nodes list file into a nodes array */ +static int convert_node_map_to_list(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + struct ctdb_node_map_old *node_map, + struct ctdb_node ***nodes, + uint32_t *num_nodes) +{ + unsigned int i; + + *nodes = talloc_zero_array(mem_ctx, + struct ctdb_node *, node_map->num); + CTDB_NO_MEMORY(ctdb, *nodes); + *num_nodes = node_map->num; + + for (i = 0; i < node_map->num; i++) { + struct ctdb_node *node; + + node = talloc_zero(*nodes, struct ctdb_node); + CTDB_NO_MEMORY(ctdb, node); + (*nodes)[i] = node; + + node->address = node_map->nodes[i].addr; + node->name = talloc_asprintf(node, "%s:%u", + 
ctdb_addr_to_str(&node->address), + ctdb_addr_to_port(&node->address)); + + node->flags = node_map->nodes[i].flags; + if (!(node->flags & NODE_FLAGS_DELETED)) { + node->flags = NODE_FLAGS_UNHEALTHY; + } + node->flags |= NODE_FLAGS_DISCONNECTED; + + node->pnn = i; + node->ctdb = ctdb; + node->dead_count = 0; + } + + return 0; +} + +/* Load the nodes list from a file */ +void ctdb_load_nodes_file(struct ctdb_context *ctdb) +{ + struct ctdb_node_map_old *node_map; + int ret; + + node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file); + if (node_map == NULL) { + goto fail; + } + + TALLOC_FREE(ctdb->nodes); + ret = convert_node_map_to_list(ctdb, ctdb, node_map, + &ctdb->nodes, &ctdb->num_nodes); + if (ret == -1) { + goto fail; + } + + talloc_free(node_map); + return; + +fail: + DEBUG(DEBUG_ERR, ("Failed to load nodes file \"%s\"\n", + ctdb->nodes_file)); + talloc_free(node_map); + exit(1); +} + +/* + setup the local node address +*/ +int ctdb_set_address(struct ctdb_context *ctdb, const char *address) +{ + ctdb->address = talloc(ctdb, ctdb_sock_addr); + CTDB_NO_MEMORY(ctdb, ctdb->address); + + if (ctdb_parse_address(ctdb, address, ctdb->address) != 0) { + return -1; + } + + ctdb->name = talloc_asprintf(ctdb, "%s:%u", + ctdb_addr_to_str(ctdb->address), + ctdb_addr_to_port(ctdb->address)); + return 0; +} + + +/* + return the number of active nodes +*/ +uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb) +{ + unsigned int i; + uint32_t count=0; + for (i=0; i < ctdb->num_nodes; i++) { + if (!(ctdb->nodes[i]->flags & NODE_FLAGS_INACTIVE)) { + count++; + } + } + return count; +} + + +/* + called when we need to process a packet. This can be a requeued packet + after a lockwait, or a real packet from another node +*/ +void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + TALLOC_CTX *tmp_ctx; + + /* place the packet as a child of the tmp_ctx. We then use + talloc_free() below to free it. 
If any of the calls want + to keep it, then they will steal it somewhere else, and the + talloc_free() will only free the tmp_ctx */ + tmp_ctx = talloc_new(ctdb); + talloc_steal(tmp_ctx, hdr); + + DEBUG(DEBUG_DEBUG,(__location__ " ctdb request %u of type %u length %u from " + "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length, + hdr->srcnode, hdr->destnode)); + + switch (hdr->operation) { + case CTDB_REQ_CALL: + case CTDB_REPLY_CALL: + case CTDB_REQ_DMASTER: + case CTDB_REPLY_DMASTER: + /* we don't allow these calls when banned */ + if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_BANNED) { + DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u" + " request %u" + " length %u from node %u to %u while node" + " is banned\n", + hdr->operation, hdr->reqid, + hdr->length, + hdr->srcnode, hdr->destnode)); + goto done; + } + + /* for ctdb_call inter-node operations verify that the + remote node that sent us the call is running in the + same generation instance as this node + */ + if (ctdb->vnn_map->generation != hdr->generation) { + DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u" + " request %u" + " length %u from node %u to %u had an" + " invalid generation id:%u while our" + " generation id is:%u\n", + hdr->operation, hdr->reqid, + hdr->length, + hdr->srcnode, hdr->destnode, + hdr->generation, ctdb->vnn_map->generation)); + goto done; + } + } + + switch (hdr->operation) { + case CTDB_REQ_CALL: + CTDB_INCREMENT_STAT(ctdb, node.req_call); + ctdb_request_call(ctdb, hdr); + break; + + case CTDB_REPLY_CALL: + CTDB_INCREMENT_STAT(ctdb, node.reply_call); + ctdb_reply_call(ctdb, hdr); + break; + + case CTDB_REPLY_ERROR: + CTDB_INCREMENT_STAT(ctdb, node.reply_error); + ctdb_reply_error(ctdb, hdr); + break; + + case CTDB_REQ_DMASTER: + CTDB_INCREMENT_STAT(ctdb, node.req_dmaster); + ctdb_request_dmaster(ctdb, hdr); + break; + + case CTDB_REPLY_DMASTER: + CTDB_INCREMENT_STAT(ctdb, node.reply_dmaster); + ctdb_reply_dmaster(ctdb, hdr); + break; + + case CTDB_REQ_MESSAGE: + 
CTDB_INCREMENT_STAT(ctdb, node.req_message); + ctdb_request_message(ctdb, hdr); + break; + + case CTDB_REQ_CONTROL: + CTDB_INCREMENT_STAT(ctdb, node.req_control); + ctdb_request_control(ctdb, hdr); + break; + + case CTDB_REPLY_CONTROL: + CTDB_INCREMENT_STAT(ctdb, node.reply_control); + ctdb_reply_control(ctdb, hdr); + break; + + case CTDB_REQ_KEEPALIVE: + CTDB_INCREMENT_STAT(ctdb, keepalive_packets_recv); + ctdb_request_keepalive(ctdb, hdr); + break; + + case CTDB_REQ_TUNNEL: + CTDB_INCREMENT_STAT(ctdb, node.req_tunnel); + ctdb_request_tunnel(ctdb, hdr); + break; + + default: + DEBUG(DEBUG_CRIT,("%s: Packet with unknown operation %u\n", + __location__, hdr->operation)); + break; + } + +done: + talloc_free(tmp_ctx); +} + + +/* + called by the transport layer when a node is dead +*/ +void ctdb_node_dead(struct ctdb_node *node) +{ + if (node->ctdb->methods == NULL) { + DBG_ERR("Can not restart transport while shutting down\n"); + return; + } + node->ctdb->methods->restart(node); + + if (node->flags & NODE_FLAGS_DISCONNECTED) { + DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", + node->ctdb->name, node->name, + node->ctdb->num_connected)); + return; + } + node->ctdb->num_connected--; + node->flags |= NODE_FLAGS_DISCONNECTED | NODE_FLAGS_UNHEALTHY; + node->rx_cnt = 0; + node->dead_count = 0; + + DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n", + node->ctdb->name, node->name, node->ctdb->num_connected)); + ctdb_daemon_cancel_controls(node->ctdb, node); +} + +/* + called by the transport layer when a node is connected +*/ +void ctdb_node_connected(struct ctdb_node *node) +{ + if (!(node->flags & NODE_FLAGS_DISCONNECTED)) { + DEBUG(DEBUG_INFO,("%s: node %s is already marked connected: %u connected\n", + node->ctdb->name, node->name, + node->ctdb->num_connected)); + return; + } + node->ctdb->num_connected++; + node->dead_count = 0; + node->flags &= ~NODE_FLAGS_DISCONNECTED; + DEBUG(DEBUG_ERR, + ("%s: connected to %s - %u connected\n", + 
node->ctdb->name, node->name, node->ctdb->num_connected)); +} + +struct queue_next { + struct ctdb_context *ctdb; + struct ctdb_req_header *hdr; +}; + + +/* + triggered when a deferred packet is due + */ +static void queue_next_trigger(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct queue_next *q = talloc_get_type(private_data, struct queue_next); + ctdb_input_pkt(q->ctdb, q->hdr); + talloc_free(q); +} + +/* + defer a packet, so it is processed on the next event loop + this is used for sending packets to ourselves + */ +static void ctdb_defer_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct queue_next *q; + q = talloc(ctdb, struct queue_next); + if (q == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Failed to allocate deferred packet\n")); + return; + } + q->ctdb = ctdb; + q->hdr = talloc_memdup(q, hdr, hdr->length); + if (q->hdr == NULL) { + talloc_free(q); + DEBUG(DEBUG_ERR,("Error copying deferred packet to self\n")); + return; + } +#if 0 + /* use this to put packets directly into our recv function */ + ctdb_input_pkt(q->ctdb, q->hdr); +#else + tevent_add_timer(ctdb->ev, q, timeval_zero(), queue_next_trigger, q); +#endif +} + + +/* + broadcast a packet to all nodes +*/ +static void ctdb_broadcast_packet_all(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr) +{ + unsigned int i; + for (i=0; i < ctdb->num_nodes; i++) { + if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) { + continue; + } + hdr->destnode = ctdb->nodes[i]->pnn; + ctdb_queue_packet(ctdb, hdr); + } +} + +/* + broadcast a packet to all active nodes +*/ +static void ctdb_broadcast_packet_active(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr) +{ + unsigned int i; + for (i = 0; i < ctdb->num_nodes; i++) { + if (ctdb->nodes[i]->flags & NODE_FLAGS_INACTIVE) { + continue; + } + + hdr->destnode = ctdb->nodes[i]->pnn; + ctdb_queue_packet(ctdb, hdr); + } +} + +/* + broadcast a packet to all connected nodes +*/ +static 
void ctdb_broadcast_packet_connected(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr) +{ + unsigned int i; + for (i=0; i < ctdb->num_nodes; i++) { + if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) { + continue; + } + if (!(ctdb->nodes[i]->flags & NODE_FLAGS_DISCONNECTED)) { + hdr->destnode = ctdb->nodes[i]->pnn; + ctdb_queue_packet(ctdb, hdr); + } + } +} + +/* + queue a packet or die +*/ +void ctdb_queue_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) +{ + struct ctdb_node *node; + + switch (hdr->destnode) { + case CTDB_BROADCAST_ALL: + ctdb_broadcast_packet_all(ctdb, hdr); + return; + case CTDB_BROADCAST_ACTIVE: + ctdb_broadcast_packet_active(ctdb, hdr); + return; + case CTDB_BROADCAST_CONNECTED: + ctdb_broadcast_packet_connected(ctdb, hdr); + return; + } + + CTDB_INCREMENT_STAT(ctdb, node_packets_sent); + + if (!ctdb_validate_pnn(ctdb, hdr->destnode)) { + DEBUG(DEBUG_CRIT,(__location__ " can't send to node %u that does not exist\n", + hdr->destnode)); + return; + } + + node = ctdb->nodes[hdr->destnode]; + + if (node->flags & NODE_FLAGS_DELETED) { + DEBUG(DEBUG_ERR, (__location__ " Can not queue packet to DELETED node %d\n", hdr->destnode)); + return; + } + + if (node->pnn == ctdb->pnn) { + ctdb_defer_packet(ctdb, hdr); + return; + } + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_ALERT, (__location__ " Can not queue packet. " + "Transport is DOWN\n")); + return; + } + + node->tx_cnt++; + if (ctdb->methods->queue_pkt(node, (uint8_t *)hdr, hdr->length) != 0) { + ctdb_fatal(ctdb, "Unable to queue packet\n"); + } +} + + + + +/* + a valgrind hack to allow us to get opcode specific backtraces + very ugly, and relies on no compiler optimisation! 
+*/ +void ctdb_queue_packet_opcode(struct ctdb_context *ctdb, struct ctdb_req_header *hdr, unsigned opcode) +{ + switch (opcode) { +#define DO_OP(x) case x: ctdb_queue_packet(ctdb, hdr); break + DO_OP(1); + DO_OP(2); + DO_OP(3); + DO_OP(4); + DO_OP(5); + DO_OP(6); + DO_OP(7); + DO_OP(8); + DO_OP(9); + DO_OP(10); + DO_OP(11); + DO_OP(12); + DO_OP(13); + DO_OP(14); + DO_OP(15); + DO_OP(16); + DO_OP(17); + DO_OP(18); + DO_OP(19); + DO_OP(20); + DO_OP(21); + DO_OP(22); + DO_OP(23); + DO_OP(24); + DO_OP(25); + DO_OP(26); + DO_OP(27); + DO_OP(28); + DO_OP(29); + DO_OP(30); + DO_OP(31); + DO_OP(32); + DO_OP(33); + DO_OP(34); + DO_OP(35); + DO_OP(36); + DO_OP(37); + DO_OP(38); + DO_OP(39); + DO_OP(40); + DO_OP(41); + DO_OP(42); + DO_OP(43); + DO_OP(44); + DO_OP(45); + DO_OP(46); + DO_OP(47); + DO_OP(48); + DO_OP(49); + DO_OP(50); + DO_OP(51); + DO_OP(52); + DO_OP(53); + DO_OP(54); + DO_OP(55); + DO_OP(56); + DO_OP(57); + DO_OP(58); + DO_OP(59); + DO_OP(60); + DO_OP(61); + DO_OP(62); + DO_OP(63); + DO_OP(64); + DO_OP(65); + DO_OP(66); + DO_OP(67); + DO_OP(68); + DO_OP(69); + DO_OP(70); + DO_OP(71); + DO_OP(72); + DO_OP(73); + DO_OP(74); + DO_OP(75); + DO_OP(76); + DO_OP(77); + DO_OP(78); + DO_OP(79); + DO_OP(80); + DO_OP(81); + DO_OP(82); + DO_OP(83); + DO_OP(84); + DO_OP(85); + DO_OP(86); + DO_OP(87); + DO_OP(88); + DO_OP(89); + DO_OP(90); + DO_OP(91); + DO_OP(92); + DO_OP(93); + DO_OP(94); + DO_OP(95); + DO_OP(96); + DO_OP(97); + DO_OP(98); + DO_OP(99); + DO_OP(100); + default: + ctdb_queue_packet(ctdb, hdr); + break; + } +} diff --git a/ctdb/server/ctdb_statistics.c b/ctdb/server/ctdb_statistics.c new file mode 100644 index 0000000..4cf8f9e --- /dev/null +++ b/ctdb/server/ctdb_statistics.c @@ -0,0 +1,93 @@ +/* + ctdb statistics code + + Copyright (C) Ronnie Sahlberg 2010 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the 
License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" +#include "system/time.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" + +#include "ctdb_private.h" + +#include "common/logging.h" + +static void ctdb_statistics_update(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *p) +{ + struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context); + + memmove(&ctdb->statistics_history[1], &ctdb->statistics_history[0], (MAX_STAT_HISTORY-1)*sizeof(struct ctdb_statistics)); + memcpy(&ctdb->statistics_history[0], &ctdb->statistics_current, sizeof(struct ctdb_statistics)); + ctdb->statistics_history[0].statistics_current_time = timeval_current(); + + + bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics)); + ctdb->statistics_current.statistics_start_time = timeval_current(); + + tevent_add_timer(ctdb->ev, ctdb, + timeval_current_ofs(ctdb->tunable.stat_history_interval, 0), + ctdb_statistics_update, ctdb); +} + +int ctdb_statistics_init(struct ctdb_context *ctdb) +{ + bzero(&ctdb->statistics, sizeof(struct ctdb_statistics)); + ctdb->statistics.statistics_start_time = timeval_current(); + + bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics)); + ctdb->statistics_current.statistics_start_time = timeval_current(); + + bzero(ctdb->statistics_history, sizeof(ctdb->statistics_history)); + + tevent_add_timer(ctdb->ev, ctdb, + timeval_current_ofs(ctdb->tunable.stat_history_interval, 0), + ctdb_statistics_update, ctdb); + return 0; +} + 
+ +int32_t ctdb_control_get_stat_history(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA *outdata) +{ + int len; + struct ctdb_statistics_list_old *s; + + len = offsetof(struct ctdb_statistics_list_old, stats) + + MAX_STAT_HISTORY*sizeof(struct ctdb_statistics); + + s = talloc_size(outdata, len); + if (s == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Failed to allocate statistics history structure\n")); + return -1; + } + + s->num = MAX_STAT_HISTORY; + memcpy(&s->stats[0], &ctdb->statistics_history[0], sizeof(ctdb->statistics_history)); + + outdata->dsize = len; + outdata->dptr = (uint8_t *)s; + + return 0; +} diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c new file mode 100644 index 0000000..b622faf --- /dev/null +++ b/ctdb/server/ctdb_takeover.c @@ -0,0 +1,2751 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/time.h" +#include "system/wait.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/util_process.h" + +#include "protocol/protocol_util.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/reqid.h" +#include "common/system.h" +#include "common/system_socket.h" +#include "common/common.h" +#include "common/logging.h" + +#include "server/ctdb_config.h" + +#include "server/ipalloc.h" + +#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0) + +#define CTDB_ARP_INTERVAL 1 +#define CTDB_ARP_REPEAT 3 + +struct ctdb_interface { + struct ctdb_interface *prev, *next; + const char *name; + bool link_up; + uint32_t references; +}; + +struct vnn_interface { + struct vnn_interface *prev, *next; + struct ctdb_interface *iface; +}; + +/* state associated with a public ip address */ +struct ctdb_vnn { + struct ctdb_vnn *prev, *next; + + struct ctdb_interface *iface; + struct vnn_interface *ifaces; + ctdb_sock_addr public_address; + uint8_t public_netmask_bits; + + /* + * The node number that is serving this public address - set + * to CTDB_UNKNOWN_PNN if node is serving it + */ + uint32_t pnn; + + /* List of clients to tickle for this public address */ + struct ctdb_tcp_array *tcp_array; + + /* whether we need to update the other nodes with changes to our list + of connected clients */ + bool tcp_update_needed; + + /* a context to hang sending gratious arp events off */ + TALLOC_CTX *takeover_ctx; + + /* Set to true any time an update to this VNN is in flight. + This helps to avoid races. */ + bool update_in_flight; + + /* If CTDB_CONTROL_DEL_PUBLIC_IP is received for this IP + * address then this flag is set. It will be deleted in the + * release IP callback. 
*/ + bool delete_pending; +}; + +static const char *iface_string(const struct ctdb_interface *iface) +{ + return (iface != NULL ? iface->name : "__none__"); +} + +static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn) +{ + return iface_string(vnn->iface); +} + +static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb, + const char *iface); + +static struct ctdb_interface * +ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface) +{ + struct ctdb_interface *i; + + if (strlen(iface) > CTDB_IFACE_SIZE) { + DEBUG(DEBUG_ERR, ("Interface name too long \"%s\"\n", iface)); + return NULL; + } + + /* Verify that we don't have an entry for this ip yet */ + i = ctdb_find_iface(ctdb, iface); + if (i != NULL) { + return i; + } + + /* create a new structure for this interface */ + i = talloc_zero(ctdb, struct ctdb_interface); + if (i == NULL) { + DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); + return NULL; + } + i->name = talloc_strdup(i, iface); + if (i->name == NULL) { + DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); + talloc_free(i); + return NULL; + } + + i->link_up = true; + + DLIST_ADD(ctdb->ifaces, i); + + return i; +} + +static bool vnn_has_interface(struct ctdb_vnn *vnn, + const struct ctdb_interface *iface) +{ + struct vnn_interface *i; + + for (i = vnn->ifaces; i != NULL; i = i->next) { + if (iface == i->iface) { + return true; + } + } + + return false; +} + +/* If any interfaces now have no possible IPs then delete them. This + * implementation is naive (i.e. simple) rather than clever + * (i.e. complex). Given that this is run on delip and that operation + * is rare, this doesn't need to be efficient - it needs to be + * foolproof. One alternative is reference counting, where the logic + * is distributed and can, therefore, be broken in multiple places. 
+ * Another alternative is to build a red-black tree of interfaces that + * can have addresses (by walking ctdb->vnn once) and then walking + * ctdb->ifaces once and deleting those not in the tree. Let's go to + * one of those if the naive implementation causes problems... :-) + */ +static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb, + struct ctdb_vnn *vnn) +{ + struct ctdb_interface *i, *next; + + /* For each interface, check if there's an IP using it. */ + for (i = ctdb->ifaces; i != NULL; i = next) { + struct ctdb_vnn *tv; + bool found; + next = i->next; + + /* Only consider interfaces named in the given VNN. */ + if (!vnn_has_interface(vnn, i)) { + continue; + } + + /* Search for a vnn with this interface. */ + found = false; + for (tv=ctdb->vnn; tv; tv=tv->next) { + if (vnn_has_interface(tv, i)) { + found = true; + break; + } + } + + if (!found) { + /* None of the VNNs are using this interface. */ + DLIST_REMOVE(ctdb->ifaces, i); + talloc_free(i); + } + } +} + + +static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb, + const char *iface) +{ + struct ctdb_interface *i; + + for (i=ctdb->ifaces;i;i=i->next) { + if (strcmp(i->name, iface) == 0) { + return i; + } + } + + return NULL; +} + +static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb, + struct ctdb_vnn *vnn) +{ + struct vnn_interface *i; + struct ctdb_interface *cur = NULL; + struct ctdb_interface *best = NULL; + + for (i = vnn->ifaces; i != NULL; i = i->next) { + + cur = i->iface; + + if (!cur->link_up) { + continue; + } + + if (best == NULL) { + best = cur; + continue; + } + + if (cur->references < best->references) { + best = cur; + continue; + } + } + + return best; +} + +static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb, + struct ctdb_vnn *vnn) +{ + struct ctdb_interface *best = NULL; + + if (vnn->iface) { + DEBUG(DEBUG_INFO, (__location__ " public address '%s' " + "still assigned to iface '%s'\n", + 
ctdb_addr_to_str(&vnn->public_address), + ctdb_vnn_iface_string(vnn))); + return 0; + } + + best = ctdb_vnn_best_iface(ctdb, vnn); + if (best == NULL) { + DEBUG(DEBUG_ERR, (__location__ " public address '%s' " + "cannot assign to iface any iface\n", + ctdb_addr_to_str(&vnn->public_address))); + return -1; + } + + vnn->iface = best; + best->references++; + vnn->pnn = ctdb->pnn; + + DEBUG(DEBUG_INFO, (__location__ " public address '%s' " + "now assigned to iface '%s' refs[%d]\n", + ctdb_addr_to_str(&vnn->public_address), + ctdb_vnn_iface_string(vnn), + best->references)); + return 0; +} + +static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb, + struct ctdb_vnn *vnn) +{ + DEBUG(DEBUG_INFO, (__location__ " public address '%s' " + "now unassigned (old iface '%s' refs[%d])\n", + ctdb_addr_to_str(&vnn->public_address), + ctdb_vnn_iface_string(vnn), + vnn->iface?vnn->iface->references:0)); + if (vnn->iface) { + vnn->iface->references--; + } + vnn->iface = NULL; + if (vnn->pnn == ctdb->pnn) { + vnn->pnn = CTDB_UNKNOWN_PNN; + } +} + +static bool ctdb_vnn_available(struct ctdb_context *ctdb, + struct ctdb_vnn *vnn) +{ + uint32_t flags; + struct vnn_interface *i; + + /* Nodes that are not RUNNING can not host IPs */ + if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) { + return false; + } + + flags = ctdb->nodes[ctdb->pnn]->flags; + if ((flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED)) != 0) { + return false; + } + + if (vnn->delete_pending) { + return false; + } + + if (vnn->iface && vnn->iface->link_up) { + return true; + } + + for (i = vnn->ifaces; i != NULL; i = i->next) { + if (i->iface->link_up) { + return true; + } + } + + return false; +} + +struct ctdb_takeover_arp { + struct ctdb_context *ctdb; + uint32_t count; + ctdb_sock_addr addr; + struct ctdb_tcp_array *tcparray; + struct ctdb_vnn *vnn; +}; + + +/* + lists of tcp endpoints + */ +struct ctdb_tcp_list { + struct ctdb_tcp_list *prev, *next; + struct ctdb_client *client; + struct ctdb_connection connection; 
+}; + +/* + send a gratuitous arp + */ +static void ctdb_control_send_arp(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_takeover_arp *arp = talloc_get_type(private_data, + struct ctdb_takeover_arp); + int ret; + struct ctdb_tcp_array *tcparray; + const char *iface; + + /* IP address might have been released between sends */ + if (arp->vnn->iface == NULL) { + DBG_INFO("Cancelling ARP send for released IP %s\n", + ctdb_addr_to_str(&arp->vnn->public_address)); + talloc_free(arp); + return; + } + + iface = ctdb_vnn_iface_string(arp->vnn); + ret = ctdb_sys_send_arp(&arp->addr, iface); + if (ret != 0) { + DBG_ERR("Failed to send ARP on interface %s: %s\n", + iface, strerror(ret)); + } + + tcparray = arp->tcparray; + if (tcparray) { + unsigned int i; + + for (i=0;i<tcparray->num;i++) { + struct ctdb_connection *tcon; + char buf[128]; + + tcon = &tcparray->connections[i]; + ret = ctdb_connection_to_buf(buf, + sizeof(buf), + tcon, + false, + " -> "); + if (ret != 0) { + strlcpy(buf, "UNKNOWN", sizeof(buf)); + } + D_INFO("Send TCP tickle ACK: %s\n", buf); + ret = ctdb_sys_send_tcp( + &tcon->src, + &tcon->dst, + 0, 0, 0); + if (ret != 0) { + DBG_ERR("Failed to send TCP tickle ACK: %s\n", + buf); + } + } + } + + arp->count++; + + if (arp->count == CTDB_ARP_REPEAT) { + talloc_free(arp); + return; + } + + tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx, + timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), + ctdb_control_send_arp, arp); +} + +static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb, + struct ctdb_vnn *vnn) +{ + struct ctdb_takeover_arp *arp; + struct ctdb_tcp_array *tcparray; + + if (!vnn->takeover_ctx) { + vnn->takeover_ctx = talloc_new(vnn); + if (!vnn->takeover_ctx) { + return -1; + } + } + + arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp); + if (!arp) { + return -1; + } + + arp->ctdb = ctdb; + arp->addr = vnn->public_address; + arp->vnn = vnn; + + tcparray = 
vnn->tcp_array; + if (tcparray) { + /* add all of the known tcp connections for this IP to the + list of tcp connections to send tickle acks for */ + arp->tcparray = talloc_steal(arp, tcparray); + + vnn->tcp_array = NULL; + vnn->tcp_update_needed = true; + } + + tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx, + timeval_zero(), ctdb_control_send_arp, arp); + + return 0; +} + +struct ctdb_do_takeip_state { + struct ctdb_req_control_old *c; + struct ctdb_vnn *vnn; +}; + +/* + called when takeip event finishes + */ +static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status, + void *private_data) +{ + struct ctdb_do_takeip_state *state = + talloc_get_type(private_data, struct ctdb_do_takeip_state); + int32_t ret; + TDB_DATA data; + + if (status != 0) { + if (status == -ETIMEDOUT) { + ctdb_ban_self(ctdb); + } + DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n", + ctdb_addr_to_str(&state->vnn->public_address), + ctdb_vnn_iface_string(state->vnn))); + ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); + + talloc_free(state); + return; + } + + if (ctdb->do_checkpublicip) { + + ret = ctdb_announce_vnn_iface(ctdb, state->vnn); + if (ret != 0) { + ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL); + talloc_free(state); + return; + } + + } + + data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address); + data.dsize = strlen((char *)data.dptr) + 1; + DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr)); + + ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data); + + + /* the control succeeded */ + ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL); + talloc_free(state); + return; +} + +static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state) +{ + state->vnn->update_in_flight = false; + return 0; +} + +/* + take over an ip address + */ +static int32_t ctdb_do_takeip(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + struct ctdb_vnn *vnn) 
+{ + int ret; + struct ctdb_do_takeip_state *state; + + if (vnn->update_in_flight) { + DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected " + "update for this IP already in flight\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits)); + return -1; + } + + ret = ctdb_vnn_assign_iface(ctdb, vnn); + if (ret != 0) { + DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to " + "assign a usable interface\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits)); + return -1; + } + + state = talloc(vnn, struct ctdb_do_takeip_state); + CTDB_NO_MEMORY(ctdb, state); + + state->c = NULL; + state->vnn = vnn; + + vnn->update_in_flight = true; + talloc_set_destructor(state, ctdb_takeip_destructor); + + DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits, + ctdb_vnn_iface_string(vnn))); + + ret = ctdb_event_script_callback(ctdb, + state, + ctdb_do_takeip_callback, + state, + CTDB_EVENT_TAKE_IP, + "%s %s %u", + ctdb_vnn_iface_string(vnn), + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits); + + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n", + ctdb_addr_to_str(&vnn->public_address), + ctdb_vnn_iface_string(vnn))); + talloc_free(state); + return -1; + } + + state->c = talloc_steal(ctdb, c); + return 0; +} + +struct ctdb_do_updateip_state { + struct ctdb_req_control_old *c; + struct ctdb_interface *old; + struct ctdb_vnn *vnn; +}; + +/* + called when updateip event finishes + */ +static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status, + void *private_data) +{ + struct ctdb_do_updateip_state *state = + talloc_get_type(private_data, struct ctdb_do_updateip_state); + + if (status != 0) { + if (status == -ETIMEDOUT) { + ctdb_ban_self(ctdb); + } + DEBUG(DEBUG_ERR, + ("Failed update of IP %s from interface %s to %s\n", + ctdb_addr_to_str(&state->vnn->public_address), + iface_string(state->old), 
+ ctdb_vnn_iface_string(state->vnn))); + + /* + * All we can do is reset the old interface + * and let the next run fix it + */ + ctdb_vnn_unassign_iface(ctdb, state->vnn); + state->vnn->iface = state->old; + state->vnn->iface->references++; + + ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); + talloc_free(state); + return; + } + + /* the control succeeded */ + ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL); + talloc_free(state); + return; +} + +static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state) +{ + state->vnn->update_in_flight = false; + return 0; +} + +/* + update (move) an ip address + */ +static int32_t ctdb_do_updateip(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + struct ctdb_vnn *vnn) +{ + int ret; + struct ctdb_do_updateip_state *state; + struct ctdb_interface *old = vnn->iface; + const char *old_name = iface_string(old); + const char *new_name; + + if (vnn->update_in_flight) { + DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected " + "update for this IP already in flight\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits)); + return -1; + } + + ctdb_vnn_unassign_iface(ctdb, vnn); + ret = ctdb_vnn_assign_iface(ctdb, vnn); + if (ret != 0) { + DEBUG(DEBUG_ERR,("Update of IP %s/%u failed to " + "assign a usable interface (old iface '%s')\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits, + old_name)); + return -1; + } + + if (old == vnn->iface) { + /* A benign update from one interface onto itself. + * no need to run the eventscripts in this case, just return + * success. 
+ */ + ctdb_request_control_reply(ctdb, c, NULL, 0, NULL); + return 0; + } + + state = talloc(vnn, struct ctdb_do_updateip_state); + CTDB_NO_MEMORY(ctdb, state); + + state->c = NULL; + state->old = old; + state->vnn = vnn; + + vnn->update_in_flight = true; + talloc_set_destructor(state, ctdb_updateip_destructor); + + new_name = ctdb_vnn_iface_string(vnn); + DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from " + "interface %s to %s\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits, + old_name, + new_name)); + + ret = ctdb_event_script_callback(ctdb, + state, + ctdb_do_updateip_callback, + state, + CTDB_EVENT_UPDATE_IP, + "%s %s %s %u", + old_name, + new_name, + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed update IP %s from interface %s to %s\n", + ctdb_addr_to_str(&vnn->public_address), + old_name, new_name)); + talloc_free(state); + return -1; + } + + state->c = talloc_steal(ctdb, c); + return 0; +} + +/* + Find the vnn of the node that has a public ip address + returns -1 if the address is not known as a public address + */ +static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr) +{ + struct ctdb_vnn *vnn; + + for (vnn=ctdb->vnn;vnn;vnn=vnn->next) { + if (ctdb_same_ip(&vnn->public_address, addr)) { + return vnn; + } + } + + return NULL; +} + +/* + take over an ip address + */ +int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, + bool *async_reply) +{ + int ret; + struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr; + struct ctdb_vnn *vnn; + bool have_ip = false; + bool do_updateip = false; + bool do_takeip = false; + struct ctdb_interface *best_iface = NULL; + + if (pip->pnn != ctdb->pnn) { + DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' " + "with pnn %d, but we're node %d\n", + ctdb_addr_to_str(&pip->addr), + pip->pnn, ctdb->pnn)); + return -1; + } 
+ + /* update out vnn list */ + vnn = find_public_ip_vnn(ctdb, &pip->addr); + if (vnn == NULL) { + DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n", + ctdb_addr_to_str(&pip->addr))); + return 0; + } + + if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) { + have_ip = ctdb_sys_have_ip(&pip->addr); + } + best_iface = ctdb_vnn_best_iface(ctdb, vnn); + if (best_iface == NULL) { + DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find" + "a usable interface (old %s, have_ip %d)\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits, + ctdb_vnn_iface_string(vnn), + have_ip)); + return -1; + } + + if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != CTDB_UNKNOWN_PNN) { + DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, " + "and we have it on iface[%s], but it was assigned to node %d" + "and we are node %d, banning ourself\n", + ctdb_addr_to_str(&vnn->public_address), + ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn)); + ctdb_ban_self(ctdb); + return -1; + } + + if (vnn->pnn == CTDB_UNKNOWN_PNN && have_ip) { + /* This will cause connections to be reset and + * reestablished. However, this is a very unusual + * situation and doing this will completely repair the + * inconsistency in the VNN. 
+ */ + DEBUG(DEBUG_WARNING, + (__location__ + " Doing updateip for IP %s already on an interface\n", + ctdb_addr_to_str(&vnn->public_address))); + do_updateip = true; + } + + if (vnn->iface) { + if (vnn->iface != best_iface) { + if (!vnn->iface->link_up) { + do_updateip = true; + } else if (vnn->iface->references > (best_iface->references + 1)) { + /* only move when the rebalance gains something */ + do_updateip = true; + } + } + } + + if (!have_ip) { + if (do_updateip) { + ctdb_vnn_unassign_iface(ctdb, vnn); + do_updateip = false; + } + do_takeip = true; + } + + if (do_takeip) { + ret = ctdb_do_takeip(ctdb, c, vnn); + if (ret != 0) { + return -1; + } + } else if (do_updateip) { + ret = ctdb_do_updateip(ctdb, c, vnn); + if (ret != 0) { + return -1; + } + } else { + /* + * The interface is up and the kernel known the ip + * => do nothing + */ + DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n", + ctdb_addr_to_str(&pip->addr), + vnn->public_netmask_bits, + ctdb_vnn_iface_string(vnn))); + return 0; + } + + /* tell ctdb_control.c that we will be replying asynchronously */ + *async_reply = true; + + return 0; +} + +static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn) +{ + DLIST_REMOVE(ctdb->vnn, vnn); + ctdb_vnn_unassign_iface(ctdb, vnn); + ctdb_remove_orphaned_ifaces(ctdb, vnn); + talloc_free(vnn); +} + +static struct ctdb_vnn *release_ip_post(struct ctdb_context *ctdb, + struct ctdb_vnn *vnn, + ctdb_sock_addr *addr) +{ + TDB_DATA data; + + /* Send a message to all clients of this node telling them + * that the cluster has been reconfigured and they should + * close any connections on this IP address + */ + data.dptr = (uint8_t *)ctdb_addr_to_str(addr); + data.dsize = strlen((char *)data.dptr)+1; + DEBUG(DEBUG_INFO, ("Sending RELEASE_IP message for %s\n", data.dptr)); + ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data); + + ctdb_vnn_unassign_iface(ctdb, vnn); + + /* Process the IP if it has 
been marked for deletion */ + if (vnn->delete_pending) { + do_delete_ip(ctdb, vnn); + return NULL; + } + + return vnn; +} + +struct release_ip_callback_state { + struct ctdb_req_control_old *c; + ctdb_sock_addr *addr; + struct ctdb_vnn *vnn; + uint32_t target_pnn; +}; + +/* + called when releaseip event finishes + */ +static void release_ip_callback(struct ctdb_context *ctdb, int status, + void *private_data) +{ + struct release_ip_callback_state *state = + talloc_get_type(private_data, struct release_ip_callback_state); + + if (status == -ETIMEDOUT) { + ctdb_ban_self(ctdb); + } + + if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) { + if (ctdb_sys_have_ip(state->addr)) { + DEBUG(DEBUG_ERR, + ("IP %s still hosted during release IP callback, failing\n", + ctdb_addr_to_str(state->addr))); + ctdb_request_control_reply(ctdb, state->c, + NULL, -1, NULL); + talloc_free(state); + return; + } + } + + state->vnn->pnn = state->target_pnn; + state->vnn = release_ip_post(ctdb, state->vnn, state->addr); + + /* the control succeeded */ + ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL); + talloc_free(state); +} + +static int ctdb_releaseip_destructor(struct release_ip_callback_state *state) +{ + if (state->vnn != NULL) { + state->vnn->update_in_flight = false; + } + return 0; +} + +/* + release an ip address + */ +int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, + bool *async_reply) +{ + int ret; + struct release_ip_callback_state *state; + struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr; + struct ctdb_vnn *vnn; + const char *iface; + + /* update our vnn list */ + vnn = find_public_ip_vnn(ctdb, &pip->addr); + if (vnn == NULL) { + DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n", + ctdb_addr_to_str(&pip->addr))); + return 0; + } + + /* stop any previous arps */ + talloc_free(vnn->takeover_ctx); + vnn->takeover_ctx = NULL; + + /* RELEASE_IP 
controls are sent to all nodes that should not + * be hosting a particular IP. This serves 2 purposes. The + * first is to help resolve any inconsistencies. If a node + * does unexpectedly host an IP then it will be released. The + * 2nd is to use a "redundant release" to tell non-takeover + * nodes where an IP is moving to. This is how "ctdb ip" can + * report the (likely) location of an IP by only asking the + * local node. Redundant releases need to update the PNN but + * are otherwise ignored. + */ + if (ctdb_config.failover_disabled == 0 && ctdb->do_checkpublicip) { + if (!ctdb_sys_have_ip(&pip->addr)) { + DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", + ctdb_addr_to_str(&pip->addr), + vnn->public_netmask_bits, + ctdb_vnn_iface_string(vnn))); + vnn->pnn = pip->pnn; + ctdb_vnn_unassign_iface(ctdb, vnn); + return 0; + } + } else { + if (vnn->iface == NULL) { + DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n", + ctdb_addr_to_str(&pip->addr), + vnn->public_netmask_bits)); + vnn->pnn = pip->pnn; + return 0; + } + } + + /* There is a potential race between take_ip and us because we + * update the VNN via a callback that run when the + * eventscripts have been run. Avoid the race by allowing one + * update to be in flight at a time. 
+ */ + if (vnn->update_in_flight) { + DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected " + "update for this IP already in flight\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits)); + return -1; + } + + iface = ctdb_vnn_iface_string(vnn); + + DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n", + ctdb_addr_to_str(&pip->addr), + vnn->public_netmask_bits, + iface, + pip->pnn)); + + state = talloc(ctdb, struct release_ip_callback_state); + if (state == NULL) { + ctdb_set_error(ctdb, "Out of memory at %s:%d", + __FILE__, __LINE__); + return -1; + } + + state->c = NULL; + state->addr = talloc(state, ctdb_sock_addr); + if (state->addr == NULL) { + ctdb_set_error(ctdb, "Out of memory at %s:%d", + __FILE__, __LINE__); + talloc_free(state); + return -1; + } + *state->addr = pip->addr; + state->target_pnn = pip->pnn; + state->vnn = vnn; + + vnn->update_in_flight = true; + talloc_set_destructor(state, ctdb_releaseip_destructor); + + ret = ctdb_event_script_callback(ctdb, + state, release_ip_callback, state, + CTDB_EVENT_RELEASE_IP, + "%s %s %u", + iface, + ctdb_addr_to_str(&pip->addr), + vnn->public_netmask_bits); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n", + ctdb_addr_to_str(&pip->addr), + ctdb_vnn_iface_string(vnn))); + talloc_free(state); + return -1; + } + + /* tell the control that we will be reply asynchronously */ + *async_reply = true; + state->c = talloc_steal(state, c); + return 0; +} + +static int ctdb_add_public_address(struct ctdb_context *ctdb, + ctdb_sock_addr *addr, + unsigned mask, const char *ifaces, + bool check_address) +{ + struct ctdb_vnn *vnn; + char *tmp; + const char *iface; + + /* Verify that we don't have an entry for this IP yet */ + for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) { + if (ctdb_same_sockaddr(addr, &vnn->public_address)) { + D_ERR("Duplicate public IP address '%s'\n", + ctdb_addr_to_str(addr)); + return -1; + } + } + + /* Create a new VNN 
structure for this IP address */ + vnn = talloc_zero(ctdb, struct ctdb_vnn); + if (vnn == NULL) { + DBG_ERR("Memory allocation error\n"); + return -1; + } + tmp = talloc_strdup(vnn, ifaces); + if (tmp == NULL) { + DBG_ERR("Memory allocation error\n"); + talloc_free(vnn); + return -1; + } + for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) { + struct vnn_interface *vnn_iface; + struct ctdb_interface *i; + + if (!ctdb_sys_check_iface_exists(iface)) { + D_ERR("Unknown interface %s for public address %s\n", + iface, + ctdb_addr_to_str(addr)); + talloc_free(vnn); + return -1; + } + + i = ctdb_add_local_iface(ctdb, iface); + if (i == NULL) { + D_ERR("Failed to add interface '%s' " + "for public address %s\n", + iface, + ctdb_addr_to_str(addr)); + talloc_free(vnn); + return -1; + } + + vnn_iface = talloc_zero(vnn, struct vnn_interface); + if (vnn_iface == NULL) { + DBG_ERR("Memory allocation error\n"); + talloc_free(vnn); + return -1; + } + + vnn_iface->iface = i; + DLIST_ADD_END(vnn->ifaces, vnn_iface); + } + talloc_free(tmp); + vnn->public_address = *addr; + vnn->public_netmask_bits = mask; + vnn->pnn = -1; + + DLIST_ADD(ctdb->vnn, vnn); + + return 0; +} + +/* + setup the public address lists from a file +*/ +int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses) +{ + bool ok; + char **lines; + int nlines; + int i; + + /* If no public addresses file given then try the default */ + if (ctdb->public_addresses_file == NULL) { + const char *b = getenv("CTDB_BASE"); + if (b == NULL) { + DBG_ERR("CTDB_BASE not set\n"); + return -1; + } + ctdb->public_addresses_file = talloc_asprintf( + ctdb, "%s/%s", b, "public_addresses"); + if (ctdb->public_addresses_file == NULL) { + DBG_ERR("Out of memory\n"); + return -1; + } + } + + /* If the file doesn't exist then warn and do nothing */ + ok = file_exist(ctdb->public_addresses_file); + if (!ok) { + D_WARNING("Not loading public addresses, no file %s\n", + ctdb->public_addresses_file); + return 0; 
+ } + + lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb); + if (lines == NULL) { + ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file); + return -1; + } + while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) { + nlines--; + } + + for (i=0;i<nlines;i++) { + unsigned mask; + ctdb_sock_addr addr; + const char *addrstr; + const char *ifaces; + char *tok, *line; + int ret; + + line = lines[i]; + while ((*line == ' ') || (*line == '\t')) { + line++; + } + if (*line == '#') { + continue; + } + if (strcmp(line, "") == 0) { + continue; + } + tok = strtok(line, " \t"); + addrstr = tok; + + tok = strtok(NULL, " \t"); + if (tok == NULL) { + D_ERR("No interface specified at line %u " + "of public addresses file\n", i+1); + talloc_free(lines); + return -1; + } + ifaces = tok; + + if (addrstr == NULL) { + D_ERR("Badly formed line %u in public address list\n", + i+1); + talloc_free(lines); + return -1; + } + + ret = ctdb_sock_addr_mask_from_string(addrstr, &addr, &mask); + if (ret != 0) { + D_ERR("Badly formed line %u in public address list\n", + i+1); + talloc_free(lines); + return -1; + } + + if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) { + DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1)); + talloc_free(lines); + return -1; + } + } + + + D_NOTICE("Loaded public addresses from %s\n", + ctdb->public_addresses_file); + + talloc_free(lines); + return 0; +} + +/* + destroy a ctdb_tcp_list structure + */ +static int ctdb_tcp_list_destructor(struct ctdb_tcp_list *tcp) +{ + struct ctdb_client *client = tcp->client; + struct ctdb_connection *conn = &tcp->connection; + char conn_str[132] = { 0, }; + int ret; + + ret = ctdb_connection_to_buf(conn_str, + sizeof(conn_str), + conn, + false, + " -> "); + if (ret != 0) { + strlcpy(conn_str, "UNKNOWN", sizeof(conn_str)); + } + + D_DEBUG("removing client TCP connection %s " + "(client_id %u pid %d)\n", + conn_str, 
client->client_id, client->pid); + + DLIST_REMOVE(client->tcp_list, tcp); + + /* + * We don't call ctdb_remove_connection(vnn, conn) here + * as we want the caller to decide if it's called + * directly (local only) or indirectly via a + * CTDB_CONTROL_TCP_REMOVE broadcast + */ + + return 0; +} + +/* + called by a client to inform us of a TCP connection that it is managing + that should tickled with an ACK when IP takeover is done + */ +int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id, + TDB_DATA indata) +{ + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + struct ctdb_connection *tcp_sock = NULL; + struct ctdb_tcp_list *tcp; + struct ctdb_connection t; + int ret; + TDB_DATA data; + struct ctdb_vnn *vnn; + char conn_str[132] = { 0, }; + + /* If we don't have public IPs, tickles are useless */ + if (ctdb->vnn == NULL) { + return 0; + } + + tcp_sock = (struct ctdb_connection *)indata.dptr; + + ctdb_canonicalize_ip_inplace(&tcp_sock->src); + ctdb_canonicalize_ip_inplace(&tcp_sock->dst); + + ret = ctdb_connection_to_buf(conn_str, + sizeof(conn_str), + tcp_sock, + false, + " -> "); + if (ret != 0) { + strlcpy(conn_str, "UNKNOWN", sizeof(conn_str)); + } + + vnn = find_public_ip_vnn(ctdb, &tcp_sock->dst); + if (vnn == NULL) { + D_ERR("Could not register TCP connection %s - " + "not a public address (client_id %u pid %u)\n", + conn_str, client_id, client->pid); + return 0; + } + + if (vnn->pnn != ctdb->pnn) { + D_ERR("Attempt to register tcp client for IP %s we don't hold - " + "failing (client_id %u pid %u)\n", + ctdb_addr_to_str(&tcp_sock->dst), + client_id, client->pid); + /* failing this call will tell smbd to die */ + return -1; + } + + tcp = talloc(client, struct ctdb_tcp_list); + CTDB_NO_MEMORY(ctdb, tcp); + tcp->client = client; + + tcp->connection.src = tcp_sock->src; + tcp->connection.dst = tcp_sock->dst; + + DLIST_ADD(client->tcp_list, tcp); + talloc_set_destructor(tcp, ctdb_tcp_list_destructor); + 
+ t.src = tcp_sock->src; + t.dst = tcp_sock->dst; + + data.dptr = (uint8_t *)&t; + data.dsize = sizeof(t); + + D_INFO("Registered TCP connection %s (client_id %u pid %u)\n", + conn_str, client_id, client->pid); + + /* tell all nodes about this tcp connection */ + ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, + CTDB_CONTROL_TCP_ADD, + 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n")); + return -1; + } + + return 0; +} + +static bool ctdb_client_remove_tcp(struct ctdb_client *client, + const struct ctdb_connection *conn) +{ + struct ctdb_tcp_list *tcp = NULL; + struct ctdb_tcp_list *tcp_next = NULL; + bool found = false; + + for (tcp = client->tcp_list; tcp != NULL; tcp = tcp_next) { + bool same; + + tcp_next = tcp->next; + + same = ctdb_connection_same(conn, &tcp->connection); + if (!same) { + continue; + } + + TALLOC_FREE(tcp); + found = true; + } + + return found; +} + +/* + called by a client to inform us of a TCP connection that was disconnected + */ +int32_t ctdb_control_tcp_client_disconnected(struct ctdb_context *ctdb, + uint32_t client_id, + TDB_DATA indata) +{ + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + struct ctdb_connection *tcp_sock = NULL; + int ret; + TDB_DATA data; + char conn_str[132] = { 0, }; + bool found = false; + + tcp_sock = (struct ctdb_connection *)indata.dptr; + + ctdb_canonicalize_ip_inplace(&tcp_sock->src); + ctdb_canonicalize_ip_inplace(&tcp_sock->dst); + + ret = ctdb_connection_to_buf(conn_str, + sizeof(conn_str), + tcp_sock, + false, + " -> "); + if (ret != 0) { + strlcpy(conn_str, "UNKNOWN", sizeof(conn_str)); + } + + found = ctdb_client_remove_tcp(client, tcp_sock); + if (!found) { + DBG_DEBUG("TCP connection %s not found " + "(client_id %u pid %u).\n", + conn_str, client_id, client->pid); + return 0; + } + + D_INFO("deregistered TCP connection %s " + "(client_id %u pid %u)\n", + 
conn_str, client_id, client->pid); + + data.dptr = (uint8_t *)tcp_sock; + data.dsize = sizeof(*tcp_sock); + + /* tell all nodes about this tcp connection is gone */ + ret = ctdb_daemon_send_control(ctdb, + CTDB_BROADCAST_CONNECTED, + 0, + CTDB_CONTROL_TCP_REMOVE, + 0, + CTDB_CTRL_FLAG_NOREPLY, + data, + NULL, + NULL); + if (ret != 0) { + DBG_ERR("Failed to send CTDB_CONTROL_TCP_REMOVE: %s\n", + conn_str); + return -1; + } + + return 0; +} + +/* + called by a client to inform us of a TCP connection was passed to a different + "client" (typically with multichannel to another smbd process). + */ +int32_t ctdb_control_tcp_client_passed(struct ctdb_context *ctdb, + uint32_t client_id, + TDB_DATA indata) +{ + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + struct ctdb_connection *tcp_sock = NULL; + int ret; + char conn_str[132] = { 0, }; + bool found = false; + + tcp_sock = (struct ctdb_connection *)indata.dptr; + + ctdb_canonicalize_ip_inplace(&tcp_sock->src); + ctdb_canonicalize_ip_inplace(&tcp_sock->dst); + + ret = ctdb_connection_to_buf(conn_str, + sizeof(conn_str), + tcp_sock, + false, + " -> "); + if (ret != 0) { + strlcpy(conn_str, "UNKNOWN", sizeof(conn_str)); + } + + found = ctdb_client_remove_tcp(client, tcp_sock); + if (!found) { + DBG_DEBUG("TCP connection from %s not found " + "(client_id %u pid %u).\n", + conn_str, client_id, client->pid); + return 0; + } + + D_INFO("TCP connection from %s " + "(client_id %u pid %u) passed to another client\n", + conn_str, client_id, client->pid); + + /* + * We don't call CTDB_CONTROL_TCP_REMOVE + * nor ctdb_remove_connection() as the connection + * is still alive, but handled by another client + */ + + return 0; +} + +/* + find a tcp address on a list + */ +static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, + struct ctdb_connection *tcp) +{ + unsigned int i; + + if (array == NULL) { + return NULL; + } + + for (i=0;i<array->num;i++) { + if 
(ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) && + ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) { + return &array->connections[i]; + } + } + return NULL; +} + + + +/* + called by a daemon to inform us of a TCP connection that one of its + clients managing that should tickled with an ACK when IP takeover is + done + */ +int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed) +{ + struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr; + struct ctdb_tcp_array *tcparray; + struct ctdb_connection tcp; + struct ctdb_vnn *vnn; + + /* If we don't have public IPs, tickles are useless */ + if (ctdb->vnn == NULL) { + return 0; + } + + vnn = find_public_ip_vnn(ctdb, &p->dst); + if (vnn == NULL) { + DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n", + ctdb_addr_to_str(&p->dst))); + + return -1; + } + + + tcparray = vnn->tcp_array; + + /* If this is the first tickle */ + if (tcparray == NULL) { + tcparray = talloc(vnn, struct ctdb_tcp_array); + CTDB_NO_MEMORY(ctdb, tcparray); + vnn->tcp_array = tcparray; + + tcparray->num = 0; + tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection)); + CTDB_NO_MEMORY(ctdb, tcparray->connections); + + tcparray->connections[tcparray->num].src = p->src; + tcparray->connections[tcparray->num].dst = p->dst; + tcparray->num++; + + if (tcp_update_needed) { + vnn->tcp_update_needed = true; + } + return 0; + } + + + /* Do we already have this tickle ?*/ + tcp.src = p->src; + tcp.dst = p->dst; + if (ctdb_tcp_find(tcparray, &tcp) != NULL) { + DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n", + ctdb_addr_to_str(&tcp.dst), + ntohs(tcp.dst.ip.sin_port), + vnn->pnn)); + return 0; + } + + /* A new tickle, we must add it to the array */ + tcparray->connections = talloc_realloc(tcparray, tcparray->connections, + struct ctdb_connection, + tcparray->num+1); + CTDB_NO_MEMORY(ctdb, 
tcparray->connections); + + tcparray->connections[tcparray->num].src = p->src; + tcparray->connections[tcparray->num].dst = p->dst; + tcparray->num++; + + DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n", + ctdb_addr_to_str(&tcp.dst), + ntohs(tcp.dst.ip.sin_port), + vnn->pnn)); + + if (tcp_update_needed) { + vnn->tcp_update_needed = true; + } + + return 0; +} + + +static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn) +{ + struct ctdb_connection *tcpp; + + if (vnn == NULL) { + return; + } + + /* if the array is empty we can't remove it + and we don't need to do anything + */ + if (vnn->tcp_array == NULL) { + DEBUG(DEBUG_INFO,("Trying to remove tickle that doesn't exist (array is empty) %s:%u\n", + ctdb_addr_to_str(&conn->dst), + ntohs(conn->dst.ip.sin_port))); + return; + } + + + /* See if we know this connection + if we don't know this connection then we don't need to do anything + */ + tcpp = ctdb_tcp_find(vnn->tcp_array, conn); + if (tcpp == NULL) { + DEBUG(DEBUG_INFO,("Trying to remove tickle that doesn't exist %s:%u\n", + ctdb_addr_to_str(&conn->dst), + ntohs(conn->dst.ip.sin_port))); + return; + } + + + /* We need to remove this entry from the array. 
+ Instead of allocating a new array and copying data to it + we cheat and just copy the last entry in the existing array + to the entry that is to be removed and just shring the + ->num field + */ + *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1]; + vnn->tcp_array->num--; + + /* If we deleted the last entry we also need to remove the entire array + */ + if (vnn->tcp_array->num == 0) { + talloc_free(vnn->tcp_array); + vnn->tcp_array = NULL; + } + + vnn->tcp_update_needed = true; + + DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n", + ctdb_addr_to_str(&conn->src), + ntohs(conn->src.ip.sin_port))); +} + + +/* + called by a daemon to inform us of a TCP connection that one of its + clients used are no longer needed in the tickle database + */ +int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_vnn *vnn; + struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr; + + /* If we don't have public IPs, tickles are useless */ + if (ctdb->vnn == NULL) { + return 0; + } + + vnn = find_public_ip_vnn(ctdb, &conn->dst); + if (vnn == NULL) { + DEBUG(DEBUG_ERR, + (__location__ " unable to find public address %s\n", + ctdb_addr_to_str(&conn->dst))); + return 0; + } + + ctdb_remove_connection(vnn, conn); + + return 0; +} + + +static void ctdb_send_set_tcp_tickles_for_all(struct ctdb_context *ctdb, + bool force); + +/* + Called when another daemon starts - causes all tickles for all + public addresses we are serving to be sent to the new node on the + next check. This actually causes the tickles to be sent to the + other node immediately. In case there is an error, the periodic + timer will send the updates on timer event. This is simple and + doesn't require careful error handling. 
+ */ +int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn) +{ + DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n", + (unsigned long) pnn)); + + ctdb_send_set_tcp_tickles_for_all(ctdb, true); + return 0; +} + + +/* + called when a client structure goes away - hook to remove + elements from the tcp_list in all daemons + */ +void ctdb_takeover_client_destructor_hook(struct ctdb_client *client) +{ + while (client->tcp_list) { + struct ctdb_vnn *vnn; + struct ctdb_tcp_list *tcp = client->tcp_list; + struct ctdb_connection *conn = &tcp->connection; + + vnn = find_public_ip_vnn(client->ctdb, + &conn->dst); + + /* If the IP address is hosted on this node then + * remove the connection. */ + if (vnn != NULL && vnn->pnn == client->ctdb->pnn) { + ctdb_remove_connection(vnn, conn); + } + + /* Otherwise this function has been called because the + * server IP address has been released to another node + * and the client has exited. This means that we + * should not delete the connection information. The + * takeover node processes connections too. */ + + /* + * The destructor removes from the list + */ + TALLOC_FREE(tcp); + } +} + + +void ctdb_release_all_ips(struct ctdb_context *ctdb) +{ + struct ctdb_vnn *vnn, *next; + int count = 0; + + if (ctdb_config.failover_disabled == 1) { + return; + } + + for (vnn = ctdb->vnn; vnn != NULL; vnn = next) { + /* vnn can be freed below in release_ip_post() */ + next = vnn->next; + + if (!ctdb_sys_have_ip(&vnn->public_address)) { + ctdb_vnn_unassign_iface(ctdb, vnn); + continue; + } + + /* Don't allow multiple releases at once. 
Some code, + * particularly ctdb_tickle_sentenced_connections() is + * not re-entrant */ + if (vnn->update_in_flight) { + DEBUG(DEBUG_WARNING, + (__location__ + " Not releasing IP %s/%u on interface %s, an update is already in progress\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits, + ctdb_vnn_iface_string(vnn))); + continue; + } + vnn->update_in_flight = true; + + DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n", + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits, + ctdb_vnn_iface_string(vnn))); + + ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u", + ctdb_vnn_iface_string(vnn), + ctdb_addr_to_str(&vnn->public_address), + vnn->public_netmask_bits); + /* releaseip timeouts are converted to success, so to + * detect failures just check if the IP address is + * still there... + */ + if (ctdb_sys_have_ip(&vnn->public_address)) { + DEBUG(DEBUG_ERR, + (__location__ + " IP address %s not released\n", + ctdb_addr_to_str(&vnn->public_address))); + vnn->update_in_flight = false; + continue; + } + + vnn = release_ip_post(ctdb, vnn, &vnn->public_address); + if (vnn != NULL) { + vnn->update_in_flight = false; + } + count++; + } + + DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count)); +} + + +/* + get list of public IPs + */ +int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, TDB_DATA *outdata) +{ + int i, num, len; + struct ctdb_public_ip_list_old *ips; + struct ctdb_vnn *vnn; + bool only_available = false; + + if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) { + only_available = true; + } + + /* count how many public ip structures we have */ + num = 0; + for (vnn=ctdb->vnn;vnn;vnn=vnn->next) { + num++; + } + + len = offsetof(struct ctdb_public_ip_list_old, ips) + + num*sizeof(struct ctdb_public_ip); + ips = talloc_zero_size(outdata, len); + CTDB_NO_MEMORY(ctdb, ips); + + i = 0; + for (vnn=ctdb->vnn;vnn;vnn=vnn->next) { + if 
(only_available && !ctdb_vnn_available(ctdb, vnn)) { + continue; + } + ips->ips[i].pnn = vnn->pnn; + ips->ips[i].addr = vnn->public_address; + i++; + } + ips->num = i; + len = offsetof(struct ctdb_public_ip_list_old, ips) + + i*sizeof(struct ctdb_public_ip); + + outdata->dsize = len; + outdata->dptr = (uint8_t *)ips; + + return 0; +} + +/* return one public ip, the pnn recorded for it and the interfaces listed for it */ +int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, + TDB_DATA *outdata) +{ + int i, num, len; + ctdb_sock_addr *addr; + struct ctdb_public_ip_info_old *info; + struct ctdb_vnn *vnn; + struct vnn_interface *iface; + + addr = (ctdb_sock_addr *)indata.dptr; + + vnn = find_public_ip_vnn(ctdb, addr); + if (vnn == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, " + "'%s'not a public address\n", + ctdb_addr_to_str(addr))); + return -1; + } + + /* count how many interfaces are listed for this public ip */ + num = 0; + for (iface = vnn->ifaces; iface != NULL; iface = iface->next) { + num++; + } + + len = offsetof(struct ctdb_public_ip_info_old, ifaces) + + num*sizeof(struct ctdb_iface); + info = talloc_zero_size(outdata, len); + CTDB_NO_MEMORY(ctdb, info); + + info->ip.addr = vnn->public_address; + info->ip.pnn = vnn->pnn; + info->active_idx = 0xFFFFFFFF; /* replaced below if vnn->iface is in the list */ + + i = 0; + for (iface = vnn->ifaces; iface != NULL; iface = iface->next) { + struct ctdb_interface *cur; + + cur = iface->iface; + if (vnn->iface == cur) { + info->active_idx = i; + } + strncpy(info->ifaces[i].name, cur->name, + sizeof(info->ifaces[i].name)); + info->ifaces[i].name[sizeof(info->ifaces[i].name)-1] = '\0'; /* strncpy does not terminate on truncation */ + info->ifaces[i].link_state = cur->link_up; + info->ifaces[i].references = cur->references; + + i++; + } + info->num = i; + len = offsetof(struct ctdb_public_ip_info_old, ifaces) + + i*sizeof(struct ctdb_iface); + + outdata->dsize = len; + outdata->dptr = (uint8_t *)info; + + return 0; +} +/* return name, link state and reference count of every interface in ctdb->ifaces */ +int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, +
TDB_DATA *outdata) +{ + int i, num, len; + struct ctdb_iface_list_old *ifaces; + struct ctdb_interface *cur; + + /* count how many interfaces we have */ + num = 0; + for (cur=ctdb->ifaces;cur;cur=cur->next) { + num++; + } + + len = offsetof(struct ctdb_iface_list_old, ifaces) + + num*sizeof(struct ctdb_iface); + ifaces = talloc_zero_size(outdata, len); + CTDB_NO_MEMORY(ctdb, ifaces); + + i = 0; + for (cur=ctdb->ifaces;cur;cur=cur->next) { + strncpy(ifaces->ifaces[i].name, cur->name, + sizeof(ifaces->ifaces[i].name)); + ifaces->ifaces[i].name[sizeof(ifaces->ifaces[i].name)-1] = '\0'; /* strncpy does not terminate on truncation */ + ifaces->ifaces[i].link_state = cur->link_up; + ifaces->ifaces[i].references = cur->references; + i++; + } + ifaces->num = i; + len = offsetof(struct ctdb_iface_list_old, ifaces) + + i*sizeof(struct ctdb_iface); + + outdata->dsize = len; + outdata->dptr = (uint8_t *)ifaces; + + return 0; +} +/* set the link state of a known interface; the request must carry a terminated name, link_state 0 or 1 and references 0 */ +int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata) +{ + struct ctdb_iface *info; + struct ctdb_interface *iface; + bool link_up = false; + + info = (struct ctdb_iface *)indata.dptr; + + if (info->name[CTDB_IFACE_SIZE] != '\0') { + int len = strnlen(info->name, CTDB_IFACE_SIZE); + DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n", + len, len, info->name)); + return -1; + } + + switch (info->link_state) { + case 0: + link_up = false; + break; + case 1: + link_up = true; + break; + default: + DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n", + (unsigned int)info->link_state)); + return -1; + } + + if (info->references != 0) { + DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n", + (unsigned int)info->references)); + return -1; + } + + iface = ctdb_find_iface(ctdb, info->name); + if (iface == NULL) { + return -1; + } + + if (link_up == iface->link_up) { /* nothing to do, state is unchanged */ + return 0; + } + + DEBUG(DEBUG_ERR, + ("iface[%s] has changed it's link status %s => %s\n", + iface->name, + iface->link_up?"up":"down", +
link_up?"up":"down")); + + iface->link_up = link_up; + return 0; +} + + +/* + called by a daemon to inform us of the entire list of TCP tickles for + a particular public address. + this control should only be sent by the node that is currently serving + that public address. + */ +int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr; + struct ctdb_tcp_array *tcparray; + struct ctdb_vnn *vnn; + + /* We must at least have tickles.num or else we can't verify the size + of the received data blob + */ + if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) { + DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n")); + return -1; + } + + /* verify that the size of data matches what we expect */ + if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections) + + sizeof(struct ctdb_connection) * list->num) { + DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n")); + return -1; + } + + DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n", + ctdb_addr_to_str(&list->addr))); + + vnn = find_public_ip_vnn(ctdb, &list->addr); + if (vnn == NULL) { + DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", + ctdb_addr_to_str(&list->addr))); + + return 1; + } + + if (vnn->pnn == ctdb->pnn) { + DEBUG(DEBUG_INFO, + ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n", + ctdb_addr_to_str(&list->addr))); + return 0; + } + + /* remove any old ticklelist we might have */ + talloc_free(vnn->tcp_array); + vnn->tcp_array = NULL; + + tcparray = talloc(vnn, struct ctdb_tcp_array); + CTDB_NO_MEMORY(ctdb, tcparray); + + tcparray->num = list->num; + + tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num); + CTDB_NO_MEMORY(ctdb, tcparray->connections); + + memcpy(tcparray->connections, &list->connections[0], + 
sizeof(struct ctdb_connection)*tcparray->num); + + /* We now have a new fresh tickle list array for this vnn */ + vnn->tcp_array = tcparray; + + return 0; +} + +/* + called to return the list of tickles for the public address provided + in indata. + if the port in that address is non-zero, only connections with that + destination port are returned; port 0 returns all of them. + returns 1 if the address is not a public address + */ +int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata) +{ + ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr; + struct ctdb_tickle_list_old *list; + struct ctdb_tcp_array *tcparray; + unsigned int num, i; + struct ctdb_vnn *vnn; + unsigned port; + + vnn = find_public_ip_vnn(ctdb, addr); + if (vnn == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", + ctdb_addr_to_str(addr))); + + return 1; + } + + port = ctdb_addr_to_port(addr); + + tcparray = vnn->tcp_array; + num = 0; + if (tcparray != NULL) { + if (port == 0) { + /* All connections */ + num = tcparray->num; + } else { + /* Count connections for port */ + for (i = 0; i < tcparray->num; i++) { + if (port == ctdb_addr_to_port(&tcparray->connections[i].dst)) { + num++; + } + } + } + } + + outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections) + + sizeof(struct ctdb_connection) * num; + + outdata->dptr = talloc_size(outdata, outdata->dsize); + CTDB_NO_MEMORY(ctdb, outdata->dptr); + list = (struct ctdb_tickle_list_old *)outdata->dptr; + + list->addr = *addr; + list->num = num; + + if (num == 0) { /* also covers tcparray == NULL, so it is safe to dereference below */ + return 0; + } + + num = 0; /* reused as the output index for the second pass */ + for (i = 0; i < tcparray->num; i++) { + if (port == 0 || \ + port == ctdb_addr_to_port(&tcparray->connections[i].dst)) { + list->connections[num] = tcparray->connections[i]; + num++; + } + } + + return 0; +} + + +/* + set the list of all tcp tickles for a public address + */ +static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb, + ctdb_sock_addr *addr, + struct ctdb_tcp_array *tcparray) +{ + int ret, num; + TDB_DATA data; + struct ctdb_tickle_list_old *list; + + if (tcparray) { + num =
tcparray->num; + } else { + num = 0; + } + + data.dsize = offsetof(struct ctdb_tickle_list_old, connections) + + sizeof(struct ctdb_connection) * num; + data.dptr = talloc_size(ctdb, data.dsize); + CTDB_NO_MEMORY(ctdb, data.dptr); + + list = (struct ctdb_tickle_list_old *)data.dptr; + list->addr = *addr; + list->num = num; + if (tcparray) { + memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num); + } + + ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0, + CTDB_CONTROL_SET_TCP_TICKLE_LIST, + 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL); + /* the buffer hangs off the ctdb context, so free it on the failure path too instead of leaking it */ + talloc_free(data.dptr); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n")); + return -1; + } + + return ret; +} +/* send tickle lists for every public address whose pnn is ours; unless force is set only addresses flagged tcp_update_needed are sent */ +static void ctdb_send_set_tcp_tickles_for_all(struct ctdb_context *ctdb, + bool force) +{ + struct ctdb_vnn *vnn; + int ret; + + for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) { + /* we only send out updates for public addresses that + we have taken over + */ + if (ctdb->pnn != vnn->pnn) { + continue; + } + + /* We only send out the updates if we need to */ + if (!force && !vnn->tcp_update_needed) { + continue; + } + + ret = ctdb_send_set_tcp_tickles_for_ip(ctdb, + &vnn->public_address, + vnn->tcp_array); + if (ret != 0) { + D_ERR("Failed to send the tickle update for ip %s\n", + ctdb_addr_to_str(&vnn->public_address)); + vnn->tcp_update_needed = true; /* retry on the next pass */ + } else { + D_INFO("Sent tickle update for ip %s\n", + ctdb_addr_to_str(&vnn->public_address)); + vnn->tcp_update_needed = false; + } + } + +} + +/* + perform tickle updates if required + */ +static void ctdb_update_tcp_tickles(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type( + private_data, struct ctdb_context); + + ctdb_send_set_tcp_tickles_for_all(ctdb, false); + + tevent_add_timer(ctdb->ev, ctdb->tickle_update_context, +
timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), + ctdb_update_tcp_tickles, ctdb); +} + +/* + start periodic update of tcp tickles + */ +void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb) +{ + ctdb->tickle_update_context = talloc_new(ctdb); + + tevent_add_timer(ctdb->ev, ctdb->tickle_update_context, + timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), + ctdb_update_tcp_tickles, ctdb); +} + + + + +struct control_gratious_arp { + struct ctdb_context *ctdb; + ctdb_sock_addr addr; + const char *iface; + int count; +}; + +/* + send a control_gratuitous arp + */ +static void send_gratious_arp(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + int ret; + struct control_gratious_arp *arp = talloc_get_type(private_data, + struct control_gratious_arp); + + ret = ctdb_sys_send_arp(&arp->addr, arp->iface); + if (ret != 0) { + DBG_ERR("Failed to send gratuitous ARP on iface %s: %s\n", + arp->iface, strerror(ret)); + } + + + arp->count++; + if (arp->count == CTDB_ARP_REPEAT) { + talloc_free(arp); + return; + } + + tevent_add_timer(arp->ctdb->ev, arp, + timeval_current_ofs(CTDB_ARP_INTERVAL, 0), + send_gratious_arp, arp); +} + + +/* + send a gratious arp + */ +int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr; + struct control_gratious_arp *arp; + + /* verify the size of indata */ + if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) { + DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", + (unsigned)indata.dsize, + (unsigned)offsetof(struct ctdb_addr_info_old, iface))); + return -1; + } + if (indata.dsize != + ( offsetof(struct ctdb_addr_info_old, iface) + + gratious_arp->len ) ){ + + DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. 
Was %u bytes " + "but should be %u bytes\n", + (unsigned)indata.dsize, + (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len))); + return -1; + } + + + arp = talloc(ctdb, struct control_gratious_arp); + CTDB_NO_MEMORY(ctdb, arp); + + arp->ctdb = ctdb; + arp->addr = gratious_arp->addr; + arp->iface = talloc_strdup(arp, gratious_arp->iface); + CTDB_NO_MEMORY(ctdb, arp->iface); + arp->count = 0; + + tevent_add_timer(arp->ctdb->ev, arp, + timeval_zero(), send_gratious_arp, arp); + + return 0; +} + +int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr; + int ret; + + /* verify the size of indata */ + if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) { + DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n")); + return -1; + } + if (indata.dsize != + ( offsetof(struct ctdb_addr_info_old, iface) + + pub->len ) ){ + + DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. 
Was %u bytes " + "but should be %u bytes\n", + (unsigned)indata.dsize, + (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len))); + return -1; + } + + DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr))); + + ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true); + + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n")); + return -1; + } + + return 0; +} + +int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr; + struct ctdb_vnn *vnn; + + /* verify the size of indata */ + if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) { + DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n")); + return -1; + } + if (indata.dsize != + ( offsetof(struct ctdb_addr_info_old, iface) + + pub->len ) ){ + + DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes " + "but should be %u bytes\n", + (unsigned)indata.dsize, + (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len))); + return -1; + } + + DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr))); + + /* walk over all public addresses until we find a match */ + for (vnn=ctdb->vnn;vnn;vnn=vnn->next) { + if (ctdb_same_ip(&vnn->public_address, &pub->addr)) { + if (vnn->pnn == ctdb->pnn) { + /* This IP is currently being hosted. + * Defer the deletion until the next + * takeover run. "ctdb reloadips" will + * always cause a takeover run. "ctdb + * delip" will now need an explicit + * "ctdb ipreallocated" afterwards. */ + vnn->delete_pending = true; + } else { + /* This IP is not hosted on the + * current node so just delete it + * now. 
*/ + do_delete_ip(ctdb, vnn); + } + + return 0; + } + } + + DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n", + ctdb_addr_to_str(&pub->addr))); + return -1; +} + + +struct ipreallocated_callback_state { + struct ctdb_req_control_old *c; +}; +/* event script completion: ban this node on -ETIMEDOUT, send the IPREALLOCATED message to our own pnn and reply to the stored control with status */ +static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb, + int status, void *p) +{ + struct ipreallocated_callback_state *state = + talloc_get_type(p, struct ipreallocated_callback_state); + TDB_DATA data = { .dsize = 0, }; + + if (status != 0) { + DEBUG(DEBUG_ERR, + (" \"ipreallocated\" event script failed (status %d)\n", + status)); + if (status == -ETIMEDOUT) { + ctdb_ban_self(ctdb); + } + } + + D_INFO("Sending IPREALLOCATED message\n"); + ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_IPREALLOCATED, data); + + ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); + talloc_free(state); +} + +/* A control to run the ipreallocated event; the reply is sent from ctdb_ipreallocated_callback */ +int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + bool *async_reply) +{ + int ret; + struct ipreallocated_callback_state *state; + + state = talloc(ctdb, struct ipreallocated_callback_state); + CTDB_NO_MEMORY(ctdb, state); + + DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n")); + + ret = ctdb_event_script_callback(ctdb, state, + ctdb_ipreallocated_callback, state, + CTDB_EVENT_IPREALLOCATED, + "%s", ""); + + if (ret != 0) { + DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n")); + talloc_free(state); + return -1; + } + + /* tell the caller that we will reply asynchronously and keep the request alive with the state */ + state->c = talloc_steal(state, c); + *async_reply = true; + + return 0; +} + + +struct ctdb_reloadips_handle { + struct ctdb_context *ctdb; + struct ctdb_req_control_old *c; + int status; + int fd[2]; + pid_t child; + struct tevent_fd *fde; +}; + +static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h) +{ + if (h == h->ctdb->reload_ips) { + h->ctdb->reload_ips = NULL; + } + if (h->c
!= NULL) { + ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL); + h->c = NULL; + } + ctdb_kill(h->ctdb, h->child, SIGKILL); + return 0; +} + +static void ctdb_reloadips_timeout_event(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle); + + talloc_free(h); +} + +static void ctdb_reloadips_child_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle); + + char res; + int ret; + + ret = sys_read(h->fd[0], &res, 1); + if (ret < 1 || res != 0) { + DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n")); + res = 1; + } + h->status = res; + + talloc_free(h); +} + +static int ctdb_reloadips_child(struct ctdb_context *ctdb) +{ + TALLOC_CTX *mem_ctx = talloc_new(NULL); + struct ctdb_public_ip_list_old *ips; + struct ctdb_vnn *vnn; + struct client_async_data *async_data; + struct timeval timeout; + TDB_DATA data; + struct ctdb_client_control_state *state; + bool first_add; + unsigned int i; + int ret; + + CTDB_NO_MEMORY(ctdb, mem_ctx); + + /* Read IPs from local node */ + ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), + CTDB_CURRENT_NODE, mem_ctx, &ips); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Unable to fetch public IPs from local node\n")); + talloc_free(mem_ctx); + return -1; + } + + /* Read IPs file - this is safe since this is a child process */ + ctdb->vnn = NULL; + if (ctdb_set_public_addresses(ctdb, false) != 0) { + DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n")); + talloc_free(mem_ctx); + return -1; + } + + async_data = talloc_zero(mem_ctx, struct client_async_data); + CTDB_NO_MEMORY(ctdb, async_data); + + /* Compare IPs between node and file for IPs to be deleted */ + for (i = 0; i < ips->num; i++) { + /* */ + for (vnn = 
ctdb->vnn; vnn; vnn = vnn->next) { + if (ctdb_same_ip(&vnn->public_address, + &ips->ips[i].addr)) { + /* IP is still in file */ + break; + } + } + + if (vnn == NULL) { + /* Delete IP ips->ips[i]: no vnn read from the file matched it */ + struct ctdb_addr_info_old *pub; + + DEBUG(DEBUG_NOTICE, + ("IP %s no longer configured, deleting it\n", + ctdb_addr_to_str(&ips->ips[i].addr))); + + pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old); + CTDB_NO_MEMORY(ctdb, pub); + + pub->addr = ips->ips[i].addr; + pub->mask = 0; + pub->len = 0; /* no interface string is sent for a delete */ + + timeout = TAKEOVER_TIMEOUT(); + + data.dsize = offsetof(struct ctdb_addr_info_old, + iface) + pub->len; + data.dptr = (uint8_t *)pub; + + state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0, + CTDB_CONTROL_DEL_PUBLIC_IP, + 0, data, async_data, + &timeout, NULL); + if (state == NULL) { + DEBUG(DEBUG_ERR, + (__location__ + " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n")); + goto failed; + } + + ctdb_client_async_add(async_data, state); + } + } + + /* Compare IPs between node and file for IPs to be added */ + first_add = true; + for (vnn = ctdb->vnn; vnn; vnn = vnn->next) { + for (i = 0; i < ips->num; i++) { + if (ctdb_same_ip(&vnn->public_address, + &ips->ips[i].addr)) { + /* IP already on node */ + break; + } + } + if (i == ips->num) { + /* Add IP vnn->public_address: no entry of ips matched it */ + struct ctdb_addr_info_old *pub; + const char *ifaces = NULL; + uint32_t len; + struct vnn_interface *iface = NULL; + + DEBUG(DEBUG_NOTICE, + ("New IP %s configured, adding it\n", + ctdb_addr_to_str(&vnn->public_address))); + if (first_add) { /* the REBALANCE_NODE message is sent only once, before the first add */ + uint32_t pnn = ctdb_get_pnn(ctdb); + + data.dsize = sizeof(pnn); + data.dptr = (uint8_t *)&pnn; + + ret = ctdb_client_send_message( + ctdb, + CTDB_BROADCAST_CONNECTED, + CTDB_SRVID_REBALANCE_NODE, + data); + if (ret != 0) { + DEBUG(DEBUG_WARNING, + ("Failed to send message to force node reallocation - IPs may be unbalanced\n")); + } + + first_add = false; + } + + ifaces = vnn->ifaces->iface->name; /* start of a comma-separated list of interface names */ + iface = vnn->ifaces->next; + while (iface != NULL) { + ifaces =
talloc_asprintf(vnn, "%s,%s", ifaces, + iface->iface->name); + iface = iface->next; + } + + len = strlen(ifaces) + 1; /* include the terminating NUL */ + pub = talloc_zero_size(mem_ctx, + offsetof(struct ctdb_addr_info_old, iface) + len); + CTDB_NO_MEMORY(ctdb, pub); + + pub->addr = vnn->public_address; + pub->mask = vnn->public_netmask_bits; + pub->len = len; + memcpy(&pub->iface[0], ifaces, pub->len); + + timeout = TAKEOVER_TIMEOUT(); + + data.dsize = offsetof(struct ctdb_addr_info_old, + iface) + pub->len; + data.dptr = (uint8_t *)pub; + + state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0, + CTDB_CONTROL_ADD_PUBLIC_IP, + 0, data, async_data, + &timeout, NULL); + if (state == NULL) { + DEBUG(DEBUG_ERR, + (__location__ + " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n")); + goto failed; + } + + ctdb_client_async_add(async_data, state); + } + } + + if (ctdb_client_async_wait(ctdb, async_data) != 0) { + DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n")); + goto failed; + } + + talloc_free(mem_ctx); + return 0; + +failed: + talloc_free(mem_ctx); + return -1; +} + +/* This control is sent to force the node to re-read the public addresses file + and drop any addresses we should no longer host, and add new addresses + that we are now able to host. + The work is done in a forked child running ctdb_reloadips_child() and the + control is replied to asynchronously. +*/ +int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply) +{ + struct ctdb_reloadips_handle *h; + pid_t parent = getpid(); + + if (ctdb->reload_ips != NULL) { + talloc_free(ctdb->reload_ips); + ctdb->reload_ips = NULL; + } + + h = talloc(ctdb, struct ctdb_reloadips_handle); + CTDB_NO_MEMORY(ctdb, h); + h->ctdb = ctdb; + h->c = NULL; + h->status = -1; + + if (pipe(h->fd) == -1) { + DEBUG(DEBUG_ERR,("Failed to create pipe for reloadips\n")); + talloc_free(h); + return -1; + } + + h->child = ctdb_fork(ctdb); + if (h->child == (pid_t)-1) { + DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n")); + close(h->fd[0]); + close(h->fd[1]); + talloc_free(h); + return -1; + } + + /*
child process */ + if (h->child == 0) { + signed char res = 0; + + close(h->fd[0]); + + prctl_set_comment("ctdb_reloadips"); + if (switch_from_server_to_client(ctdb) != 0) { + DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n")); + res = -1; + } else { + res = ctdb_reloadips_child(ctdb); + if (res != 0) { + DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n")); + } + } + + sys_write(h->fd[1], &res, 1); + ctdb_wait_for_process_to_exit(parent); + _exit(0); + } + + h->c = talloc_steal(h, c); + + close(h->fd[1]); + set_close_on_exec(h->fd[0]); + + talloc_set_destructor(h, ctdb_reloadips_destructor); + + + h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ, + ctdb_reloadips_child_handler, (void *)h); + tevent_fd_set_auto_close(h->fde); + + tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0), + ctdb_reloadips_timeout_event, h); + + /* we reply later */ + *async_reply = true; + return 0; +} diff --git a/ctdb/server/ctdb_takeover_helper.c b/ctdb/server/ctdb_takeover_helper.c new file mode 100644 index 0000000..c088970 --- /dev/null +++ b/ctdb/server/ctdb_takeover_helper.c @@ -0,0 +1,1276 @@ +/* + CTDB IP takeover helper + + Copyright (C) Martin Schwenke 2016 + + Based on ctdb_recovery_helper.c + Copyright (C) Amitay Isaacs 2015 + + and ctdb_takeover.c + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" + +#include <popt.h> +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/debug.h" +#include "lib/util/strv.h" +#include "lib/util/strv_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/time.h" +#include "lib/util/tevent_unix.h" + +#include "protocol/protocol.h" +#include "protocol/protocol_api.h" +#include "protocol/protocol_util.h" +#include "client/client.h" + +#include "common/logging.h" + +#include "server/ipalloc.h" + +static int takeover_timeout = 9; + +#define TIMEOUT() timeval_current_ofs(takeover_timeout, 0) + +/* + * Utility functions + */ + +static bool generic_recv(struct tevent_req *req, int *perr) +{ + int err; + + if (tevent_req_is_unix_error(req, &err)) { + if (perr != NULL) { + *perr = err; + } + return false; + } + + return true; +} + +static enum ipalloc_algorithm +determine_algorithm(const struct ctdb_tunable_list *tunables) +{ + switch (tunables->ip_alloc_algorithm) { + case 0: + return IPALLOC_DETERMINISTIC; + case 1: + return IPALLOC_NONDETERMINISTIC; + case 2: + return IPALLOC_LCP2; + default: + return IPALLOC_LCP2; + }; +} + +/**********************************************************************/ + +struct get_public_ips_state { + uint32_t *pnns; + int count; + struct ctdb_public_ip_list *ips; + uint32_t *ban_credits; +}; + +static void get_public_ips_done(struct tevent_req *subreq); + +static struct tevent_req *get_public_ips_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + uint32_t *pnns, + int count, int num_nodes, + uint32_t *ban_credits, + bool available_only) +{ + struct tevent_req *req, *subreq; + struct get_public_ips_state *state; + struct ctdb_req_control request; + + req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state); + 
if (req == NULL) { + return NULL; + } + + state->pnns = pnns; + state->count = count; + state->ban_credits = ban_credits; + + state->ips = talloc_zero_array(state, + struct ctdb_public_ip_list, + num_nodes); + if (tevent_req_nomem(state->ips, req)) { + return tevent_req_post(req, ev); + } + + /* Short circuit if no nodes being asked for IPs */ + if (state->count == 0) { + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + ctdb_req_control_get_public_ips(&request, available_only); + subreq = ctdb_client_control_multi_send(mem_ctx, ev, client, + state->pnns, + state->count, + TIMEOUT(), &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, get_public_ips_done, req); + + return req; +} + +static void get_public_ips_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct get_public_ips_state *state = tevent_req_data( + req, struct get_public_ips_state); + struct ctdb_reply_control **reply; + int *err_list; + int ret, i; + bool status, found_errors; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list, + &reply); + TALLOC_FREE(subreq); + if (! 
status) { + for (i = 0; i < state->count; i++) { + if (err_list[i] != 0) { + uint32_t pnn = state->pnns[i]; + + D_ERR("control GET_PUBLIC_IPS failed on " + "node %u, ret=%d\n", pnn, err_list[i]); + + state->ban_credits[pnn]++; + } + } + + tevent_req_error(req, ret); + return; + } + + found_errors = false; + for (i = 0; i < state->count; i++) { + uint32_t pnn; + struct ctdb_public_ip_list *ips; + + pnn = state->pnns[i]; + ret = ctdb_reply_control_get_public_ips(reply[i], state->ips, + &ips); + if (ret != 0) { + D_ERR("control GET_PUBLIC_IPS failed on " + "node %u\n", pnn); + state->ban_credits[pnn]++; + found_errors = true; + continue; + } + + D_INFO("Fetched public IPs from node %u\n", pnn); + state->ips[pnn] = *ips; + } + + if (found_errors) { + tevent_req_error(req, EIO); + return; + } + + talloc_free(reply); + + tevent_req_done(req); +} + +static bool get_public_ips_recv(struct tevent_req *req, int *perr, + TALLOC_CTX *mem_ctx, + struct ctdb_public_ip_list **ips) +{ + struct get_public_ips_state *state = tevent_req_data( + req, struct get_public_ips_state); + int err; + + if (tevent_req_is_unix_error(req, &err)) { + if (perr != NULL) { + *perr = err; + } + return false; + } + + *ips = talloc_steal(mem_ctx, state->ips); + + return true; +} + +/**********************************************************************/ + +struct release_ip_state { + int num_sent; + int num_replies; + int num_fails; + int err_any; + uint32_t *ban_credits; +}; + +struct release_ip_one_state { + struct tevent_req *req; + uint32_t *pnns; + int count; + const char *ip_str; +}; + +static void release_ip_done(struct tevent_req *subreq); + +static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + uint32_t *pnns, + int count, + struct timeval timeout, + struct public_ip_list *all_ips, + uint32_t *ban_credits) +{ + struct tevent_req *req, *subreq; + struct release_ip_state *state; + struct ctdb_req_control request; + 
struct public_ip_list *tmp_ip; + + req = tevent_req_create(mem_ctx, &state, struct release_ip_state); + if (req == NULL) { + return NULL; + } + + state->num_sent = 0; + state->num_replies = 0; + state->num_fails = 0; + state->ban_credits = ban_credits; + + /* Send a RELEASE_IP to all nodes that should not be hosting + * each IP. For each IP, all but one of these will be + * redundant. However, the redundant ones are used to tell + * nodes which node should be hosting the IP so that commands + * like "ctdb ip" can display a particular nodes idea of who + * is hosting what. */ + for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) { + struct release_ip_one_state *substate; + struct ctdb_public_ip ip; + int i; + + substate = talloc_zero(state, struct release_ip_one_state); + if (tevent_req_nomem(substate, req)) { + return tevent_req_post(req, ev); + } + + substate->pnns = talloc_zero_array(substate, uint32_t, count); + if (tevent_req_nomem(substate->pnns, req)) { + return tevent_req_post(req, ev); + } + + substate->count = 0; + substate->req = req; + + substate->ip_str = ctdb_sock_addr_to_string(substate, + &tmp_ip->addr, + false); + if (tevent_req_nomem(substate->ip_str, req)) { + return tevent_req_post(req, ev); + } + + for (i = 0; i < count; i++) { + uint32_t pnn = pnns[i]; + + /* Skip this node if IP is not known */ + if (! bitmap_query(tmp_ip->known_on, pnn)) { + continue; + } + + /* If pnn is not the node that should be + * hosting the IP then add it to the list of + * nodes that need to do a release. */ + if (tmp_ip->pnn != pnn) { + substate->pnns[substate->count] = pnn; + substate->count++; + } + } + + if (substate->count == 0) { + /* No releases to send for this address... 
*/ + TALLOC_FREE(substate); + continue; + } + + ip.pnn = tmp_ip->pnn; + ip.addr = tmp_ip->addr; + ctdb_req_control_release_ip(&request, &ip); + subreq = ctdb_client_control_multi_send(state, ev, client, + substate->pnns, + substate->count, + timeout,/* cumulative */ + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, release_ip_done, substate); + + state->num_sent++; + } + + /* None sent, finished... */ + if (state->num_sent == 0) { + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + return req; +} + +static void release_ip_done(struct tevent_req *subreq) +{ + struct release_ip_one_state *substate = tevent_req_callback_data( + subreq, struct release_ip_one_state); + struct tevent_req *req = substate->req; + struct release_ip_state *state = tevent_req_data( + req, struct release_ip_state); + int ret, i; + int *err_list; + bool status, found_errors; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, + &err_list, NULL); + TALLOC_FREE(subreq); + + if (status) { + D_INFO("RELEASE_IP %s succeeded on %d nodes\n", + substate->ip_str, substate->count); + goto done; + } + + /* Get some clear error messages out of err_list and count + * banning credits + */ + found_errors = false; + for (i = 0; i < substate->count; i++) { + int err = err_list[i]; + if (err != 0) { + uint32_t pnn = substate->pnns[i]; + + D_ERR("RELEASE_IP %s failed on node %u, " + "ret=%d\n", substate->ip_str, pnn, err); + + state->ban_credits[pnn]++; + state->err_any = err; + found_errors = true; + } + } + if (! 
found_errors) { + D_ERR("RELEASE_IP %s internal error, ret=%d\n", + substate->ip_str, ret); + state->err_any = EIO; + } + + state->num_fails++; + +done: + talloc_free(substate); + + state->num_replies++; + + if (state->num_replies < state->num_sent) { + /* Not all replies received, don't go further */ + return; + } + + if (state->num_fails > 0) { + tevent_req_error(req, state->err_any); + return; + } + + tevent_req_done(req); +} + +static bool release_ip_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + +/**********************************************************************/ + +struct take_ip_state { + int num_sent; + int num_replies; + int num_fails; + int err_any; + uint32_t *ban_credits; +}; + +struct take_ip_one_state { + struct tevent_req *req; + uint32_t pnn; + const char *ip_str; +}; + +static void take_ip_done(struct tevent_req *subreq); + +static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + struct timeval timeout, + struct public_ip_list *all_ips, + uint32_t *ban_credits) +{ + struct tevent_req *req, *subreq; + struct take_ip_state *state; + struct ctdb_req_control request; + struct public_ip_list *tmp_ip; + + req = tevent_req_create(mem_ctx, &state, struct take_ip_state); + if (req == NULL) { + return NULL; + } + + state->num_sent = 0; + state->num_replies = 0; + state->num_fails = 0; + state->ban_credits = ban_credits; + + /* For each IP, send a TAKOVER_IP to the node that should be + * hosting it. Many of these will often be redundant (since + * the allocation won't have changed) but they can be useful + * to recover from inconsistencies. 
*/ + for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) { + struct take_ip_one_state *substate; + struct ctdb_public_ip ip; + + if (tmp_ip->pnn == CTDB_UNKNOWN_PNN) { + /* IP will be unassigned */ + continue; + } + + substate = talloc_zero(state, struct take_ip_one_state); + if (tevent_req_nomem(substate, req)) { + return tevent_req_post(req, ev); + } + + substate->req = req; + substate->pnn = tmp_ip->pnn; + + substate->ip_str = ctdb_sock_addr_to_string(substate, + &tmp_ip->addr, + false); + if (tevent_req_nomem(substate->ip_str, req)) { + return tevent_req_post(req, ev); + } + + ip.pnn = tmp_ip->pnn; + ip.addr = tmp_ip->addr; + ctdb_req_control_takeover_ip(&request, &ip); + subreq = ctdb_client_control_send( + state, ev, client, tmp_ip->pnn, + timeout, /* cumulative */ + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, take_ip_done, substate); + + state->num_sent++; + } + + /* None sent, finished... */ + if (state->num_sent == 0) { + tevent_req_done(req); + return tevent_req_post(req, ev); + } + + return req; +} + +static void take_ip_done(struct tevent_req *subreq) +{ + struct take_ip_one_state *substate = tevent_req_callback_data( + subreq, struct take_ip_one_state); + struct tevent_req *req = substate->req; + struct ctdb_reply_control *reply; + struct take_ip_state *state = tevent_req_data( + req, struct take_ip_state); + int ret = 0; + bool status; + + status = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + + if (! 
status) { + D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n", + substate->ip_str, substate->pnn, ret); + goto fail; + } + + ret = ctdb_reply_control_takeover_ip(reply); + if (ret != 0) { + D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n", + substate->ip_str, substate->pnn, ret); + goto fail; + } + + D_INFO("TAKEOVER_IP %s succeeded on node %u\n", + substate->ip_str, substate->pnn); + goto done; + +fail: + state->ban_credits[substate->pnn]++; + state->num_fails++; + state->err_any = ret; + +done: + talloc_free(substate); + + state->num_replies++; + + if (state->num_replies < state->num_sent) { + /* Not all replies received, don't go further */ + return; + } + + if (state->num_fails > 0) { + tevent_req_error(req, state->err_any); + return; + } + + tevent_req_done(req); +} + +static bool take_ip_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + +/**********************************************************************/ + +struct ipreallocated_state { + uint32_t *pnns; + int count; + uint32_t *ban_credits; +}; + +static void ipreallocated_done(struct tevent_req *subreq); + +static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + uint32_t *pnns, + int count, + struct timeval timeout, + uint32_t *ban_credits) +{ + struct tevent_req *req, *subreq; + struct ipreallocated_state *state; + struct ctdb_req_control request; + + req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state); + if (req == NULL) { + return NULL; + } + + state->pnns = pnns; + state->count = count; + state->ban_credits = ban_credits; + + ctdb_req_control_ipreallocated(&request); + subreq = ctdb_client_control_multi_send(state, ev, client, + pnns, count, + timeout, /* cumulative */ + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, ipreallocated_done, req); + + return req; +} + +static void 
ipreallocated_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct ipreallocated_state *state = tevent_req_data( + req, struct ipreallocated_state); + int *err_list = NULL; + int ret, i; + bool status, found_errors; + + status = ctdb_client_control_multi_recv(subreq, &ret, state, + &err_list, NULL); + TALLOC_FREE(subreq); + + if (status) { + D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count); + tevent_req_done(req); + return; + } + + /* Get some clear error messages out of err_list and count + * banning credits + */ + found_errors = false; + for (i = 0; i < state->count; i++) { + int err = err_list[i]; + if (err != 0) { + uint32_t pnn = state->pnns[i]; + + D_ERR("IPREALLOCATED failed on node %u, ret=%d\n", + pnn, err); + + state->ban_credits[pnn]++; + found_errors = true; + } + } + + if (! found_errors) { + D_ERR("IPREALLOCATED internal error, ret=%d\n", ret); + } + + tevent_req_error(req, ret); +} + +static bool ipreallocated_recv(struct tevent_req *req, int *perr) +{ + return generic_recv(req, perr); +} + +/**********************************************************************/ + +/* + * Recalculate the allocation of public IPs to nodes and have the + * nodes host their allocated addresses. + * + * - Get tunables + * - Get nodemap + * - Initialise IP allocation state. Pass: + * + algorithm to be used; + * + various tunables (NoIPTakeover, NoIPFailback) + * + list of nodes to force rebalance (internal structure, currently + * no way to fetch, only used by LCP2 for nodes that have had new + * IP addresses added). 
+ * - Set IP flags for IP allocation based on node map + * - Retrieve known and available IP addresses (done separately so + * values can be faked in unit testing) + * - Use ipalloc_set_public_ips() to set known and available IP + * addresses for allocation + * - If cluster can't host IP addresses then jump to IPREALLOCATED + * - Run IP allocation algorithm + * - Send RELEASE_IP to all nodes for IPs they should not host + * - Send TAKE_IP to all nodes for IPs they should host + * - Send IPREALLOCATED to all nodes + */ + +struct takeover_state { + struct tevent_context *ev; + struct ctdb_client_context *client; + struct timeval timeout; + unsigned int num_nodes; + uint32_t *pnns_connected; + int num_connected; + uint32_t *pnns_active; + int num_active; + uint32_t destnode; + uint32_t *force_rebalance_nodes; + struct ctdb_tunable_list *tun_list; + struct ipalloc_state *ipalloc_state; + struct ctdb_public_ip_list *known_ips; + struct public_ip_list *all_ips; + uint32_t *ban_credits; +}; + +static void takeover_tunables_done(struct tevent_req *subreq); +static void takeover_nodemap_done(struct tevent_req *subreq); +static void takeover_known_ips_done(struct tevent_req *subreq); +static void takeover_avail_ips_done(struct tevent_req *subreq); +static void takeover_release_ip_done(struct tevent_req *subreq); +static void takeover_take_ip_done(struct tevent_req *subreq); +static void takeover_ipreallocated(struct tevent_req *req); +static void takeover_ipreallocated_done(struct tevent_req *subreq); +static void takeover_failed(struct tevent_req *subreq, int ret); +static void takeover_failed_done(struct tevent_req *subreq); + +static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + uint32_t *force_rebalance_nodes) +{ + struct tevent_req *req, *subreq; + struct takeover_state *state; + struct ctdb_req_control request; + + req = tevent_req_create(mem_ctx, &state, struct takeover_state); + if (req == 
NULL) { + return NULL; + } + + state->ev = ev; + state->client = client; + state->force_rebalance_nodes = force_rebalance_nodes; + state->destnode = ctdb_client_pnn(client); + + ctdb_req_control_get_all_tunables(&request); + subreq = ctdb_client_control_send(state, state->ev, state->client, + state->destnode, TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, takeover_tunables_done, req); + + return req; +} + +static void takeover_tunables_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct takeover_state *state = tevent_req_data( + req, struct takeover_state); + struct ctdb_reply_control *reply; + struct ctdb_req_control request; + int ret; + bool status; + + status = ctdb_client_control_recv(subreq, &ret, state, &reply); + TALLOC_FREE(subreq); + if (! status) { + D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret); + tevent_req_error(req, ret); + return; + } + + ret = ctdb_reply_control_get_all_tunables(reply, state, + &state->tun_list); + if (ret != 0) { + D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret); + tevent_req_error(req, ret); + return; + } + + talloc_free(reply); + + takeover_timeout = state->tun_list->takeover_timeout; + + ctdb_req_control_get_nodemap(&request); + subreq = ctdb_client_control_send(state, state->ev, state->client, + state->destnode, TIMEOUT(), + &request); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, takeover_nodemap_done, req); +} + +static void takeover_nodemap_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct takeover_state *state = tevent_req_data( + req, struct takeover_state); + struct ctdb_reply_control *reply; + bool status; + int ret; + struct ctdb_node_map *nodemap; + const char *ptr; + + status = ctdb_client_control_recv(subreq, &ret, state, 
&reply); + TALLOC_FREE(subreq); + if (! status) { + D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n", + state->destnode, ret); + tevent_req_error(req, ret); + return; + } + + ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap); + if (ret != 0) { + D_ERR("control GET_NODEMAP failed, ret=%d\n", ret); + tevent_req_error(req, ret); + return; + } + + state->num_nodes = nodemap->num; + + state->num_connected = list_of_connected_nodes(nodemap, + CTDB_UNKNOWN_PNN, state, + &state->pnns_connected); + if (state->num_connected <= 0) { + tevent_req_error(req, ENOMEM); + return; + } + + state->num_active = list_of_active_nodes(nodemap, + CTDB_UNKNOWN_PNN, state, + &state->pnns_active); + if (state->num_active <= 0) { + tevent_req_error(req, ENOMEM); + return; + } + + /* Default timeout for early jump to IPREALLOCATED. See below + * for explanation of 3 times... + */ + state->timeout = timeval_current_ofs(3 * takeover_timeout, 0); + + state->ban_credits = talloc_zero_array(state, uint32_t, + state->num_nodes); + if (tevent_req_nomem(state->ban_credits, req)) { + return; + } + + ptr = getenv("CTDB_DISABLE_IP_FAILOVER"); + if (ptr != NULL) { + /* IP failover is completely disabled so just send out + * ipreallocated event. 
+ */ + takeover_ipreallocated(req); + return; + } + + state->ipalloc_state = + ipalloc_state_init( + state, state->num_nodes, + determine_algorithm(state->tun_list), + (state->tun_list->no_ip_takeover != 0), + (state->tun_list->no_ip_failback != 0), + state->force_rebalance_nodes); + if (tevent_req_nomem(state->ipalloc_state, req)) { + return; + } + + subreq = get_public_ips_send(state, state->ev, state->client, + state->pnns_connected, state->num_connected, + state->num_nodes, state->ban_credits, + false); + if (tevent_req_nomem(subreq, req)) { + return; + } + + tevent_req_set_callback(subreq, takeover_known_ips_done, req); +} + +static void takeover_known_ips_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct takeover_state *state = tevent_req_data( + req, struct takeover_state); + int ret; + bool status; + uint32_t *pnns = NULL; + int count, i; + + status = get_public_ips_recv(subreq, &ret, state, &state->known_ips); + TALLOC_FREE(subreq); + + if (! 
status) { + D_ERR("Failed to fetch known public IPs\n"); + takeover_failed(req, ret); + return; + } + + /* Get available IPs from active nodes that actually have known IPs */ + + pnns = talloc_zero_array(state, uint32_t, state->num_active); + if (tevent_req_nomem(pnns, req)) { + return; + } + + count = 0; + for (i = 0; i < state->num_active; i++) { + uint32_t pnn = state->pnns_active[i]; + + /* If pnn has IPs then fetch available IPs from it */ + if (state->known_ips[pnn].num > 0) { + pnns[count] = pnn; + count++; + } + } + + subreq = get_public_ips_send(state, state->ev, state->client, + pnns, count, + state->num_nodes, state->ban_credits, + true); + if (tevent_req_nomem(subreq, req)) { + return; + } + + tevent_req_set_callback(subreq, takeover_avail_ips_done, req); +} + +static void takeover_avail_ips_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct takeover_state *state = tevent_req_data( + req, struct takeover_state); + bool status; + int ret; + struct ctdb_public_ip_list *available_ips; + + status = get_public_ips_recv(subreq, &ret, state, &available_ips); + TALLOC_FREE(subreq); + + if (! status) { + D_ERR("Failed to fetch available public IPs\n"); + takeover_failed(req, ret); + return; + } + + ipalloc_set_public_ips(state->ipalloc_state, + state->known_ips, available_ips); + + if (! ipalloc_can_host_ips(state->ipalloc_state)) { + D_NOTICE("No nodes available to host public IPs yet\n"); + takeover_ipreallocated(req); + return; + } + + /* Do the IP reassignment calculations */ + state->all_ips = ipalloc(state->ipalloc_state); + if (tevent_req_nomem(state->all_ips, req)) { + return; + } + + /* Each of the following stages (RELEASE_IP, TAKEOVER_IP, + * IPREALLOCATED) notionally has a timeout of TakeoverTimeout + * seconds. However, RELEASE_IP can take longer due to TCP + * connection killing, so sometimes needs more time. 
+ * Therefore, use a cumulative timeout of TakeoverTimeout * 3 + * seconds across all 3 stages. No explicit expiry checks are + * needed before each stage because tevent is smart enough to + * fire the timeouts even if they are in the past. Initialise + * this here so it explicitly covers the stages we're + * interested in but, in particular, not the time taken by the + * ipalloc(). + */ + state->timeout = timeval_current_ofs(3 * takeover_timeout, 0); + + subreq = release_ip_send(state, state->ev, state->client, + state->pnns_connected, state->num_connected, + state->timeout, state->all_ips, + state->ban_credits); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, takeover_release_ip_done, req); +} + +static void takeover_release_ip_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct takeover_state *state = tevent_req_data( + req, struct takeover_state); + int ret; + bool status; + + status = release_ip_recv(subreq, &ret); + TALLOC_FREE(subreq); + + if (! status) { + takeover_failed(req, ret); + return; + } + + /* All released, now for takeovers */ + + subreq = take_ip_send(state, state->ev, state->client, + state->timeout, state->all_ips, + state->ban_credits); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, takeover_take_ip_done, req); +} + +static void takeover_take_ip_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + int ret = 0; + bool status; + + status = take_ip_recv(subreq, &ret); + TALLOC_FREE(subreq); + + if (! 
status) { + takeover_failed(req, ret); + return; + } + + takeover_ipreallocated(req); +} + +static void takeover_ipreallocated(struct tevent_req *req) +{ + struct takeover_state *state = tevent_req_data( + req, struct takeover_state); + struct tevent_req *subreq; + + subreq = ipreallocated_send(state, state->ev, state->client, + state->pnns_connected, + state->num_connected, + state->timeout, + state->ban_credits); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, takeover_ipreallocated_done, req); +} + +static void takeover_ipreallocated_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + int ret; + bool status; + + status = ipreallocated_recv(subreq, &ret); + TALLOC_FREE(subreq); + + if (! status) { + takeover_failed(req, ret); + return; + } + + tevent_req_done(req); +} + +struct takeover_failed_state { + struct tevent_req *req; + int ret; +}; + +void takeover_failed(struct tevent_req *req, int ret) +{ + struct takeover_state *state = tevent_req_data( + req, struct takeover_state); + struct tevent_req *subreq; + uint32_t max_pnn = CTDB_UNKNOWN_PNN; + unsigned int max_credits = 0; + uint32_t pnn; + + /* Check that bans are enabled */ + if (state->tun_list->enable_bans == 0) { + tevent_req_error(req, ret); + return; + } + + for (pnn = 0; pnn < state->num_nodes; pnn++) { + if (state->ban_credits[pnn] > max_credits) { + max_pnn = pnn; + max_credits = state->ban_credits[pnn]; + } + } + + if (max_credits > 0) { + struct ctdb_req_message message; + struct takeover_failed_state *substate; + + D_WARNING("Assigning banning credits to node %u\n", max_pnn); + + substate = talloc_zero(state, struct takeover_failed_state); + if (tevent_req_nomem(substate, req)) { + return; + } + substate->req = req; + substate->ret = ret; + + message.srvid = CTDB_SRVID_BANNING; + message.data.pnn = max_pnn; + + subreq = ctdb_client_message_send( + state, state->ev, state->client, + 
ctdb_client_pnn(state->client), + &message); + if (subreq == NULL) { + D_ERR("failed to assign banning credits\n"); + tevent_req_error(req, ret); + return; + } + tevent_req_set_callback(subreq, takeover_failed_done, substate); + } else { + tevent_req_error(req, ret); + } +} + +static void takeover_failed_done(struct tevent_req *subreq) +{ + struct takeover_failed_state *substate = tevent_req_callback_data( + subreq, struct takeover_failed_state); + struct tevent_req *req = substate->req; + int ret; + bool status; + + status = ctdb_client_message_recv(subreq, &ret); + TALLOC_FREE(subreq); + if (! status) { + D_ERR("failed to assign banning credits, ret=%d\n", ret); + } + + ret = substate->ret; + talloc_free(substate); + tevent_req_error(req, ret); +} + +static void takeover_recv(struct tevent_req *req, int *perr) +{ + generic_recv(req, perr); +} + +static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s) +{ + char *strv = NULL; + int num, i, ret; + char *t; + uint32_t *nodes; + + ret = strv_split(mem_ctx, &strv, s, ","); + if (ret != 0) { + D_ERR("out of memory\n"); + return NULL; + } + + num = strv_count(strv); + + nodes = talloc_array(mem_ctx, uint32_t, num); + if (nodes == NULL) { + D_ERR("out of memory\n"); + return NULL; + } + + t = NULL; + for (i = 0; i < num; i++) { + t = strv_next(strv, t); + nodes[i] = atoi(t); + } + + return nodes; +} + +static void usage(const char *progname) +{ + fprintf(stderr, + "\nUsage: %s <output-fd> <ctdb-socket-path> " + "[<force-rebalance-nodes>]\n", + progname); +} + +/* + * Arguments - write fd, socket path + */ +int main(int argc, const char *argv[]) +{ + int write_fd; + const char *sockpath; + TALLOC_CTX *mem_ctx; + struct tevent_context *ev; + struct ctdb_client_context *client; + bool status; + int ret; + struct tevent_req *req; + uint32_t *force_rebalance_nodes = NULL; + + if (argc < 3 || argc > 4) { + usage(argv[0]); + exit(1); + } + + write_fd = atoi(argv[1]); + sockpath = argv[2]; + + mem_ctx = 
talloc_new(NULL); + if (mem_ctx == NULL) { + fprintf(stderr, "talloc_new() failed\n"); + ret = ENOMEM; + goto done; + } + + if (argc == 4) { + force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]); + if (force_rebalance_nodes == NULL) { + usage(argv[0]); + ret = EINVAL; + goto done; + } + } + + ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover"); + if (ret != 0) { + fprintf(stderr, + "ctdb-takeover: Unable to initialize logging\n"); + goto done; + } + + ev = tevent_context_init(mem_ctx); + if (ev == NULL) { + D_ERR("tevent_context_init() failed\n"); + ret = ENOMEM; + goto done; + } + + status = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL); + if (!status) { + D_ERR("logging_setup_sighup_handler() failed\n"); + ret = ENOMEM; + goto done; + } + + ret = ctdb_client_init(mem_ctx, ev, sockpath, &client); + if (ret != 0) { + D_ERR("ctdb_client_init() failed, ret=%d\n", ret); + goto done; + } + + req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes); + if (req == NULL) { + D_ERR("takeover_send() failed\n"); + ret = 1; + goto done; + } + + if (! tevent_req_poll(req, ev)) { + D_ERR("tevent_req_poll() failed\n"); + ret = 1; + goto done; + } + + takeover_recv(req, &ret); + TALLOC_FREE(req); + if (ret != 0) { + D_ERR("takeover run failed, ret=%d\n", ret); + } + +done: + sys_write_v(write_fd, &ret, sizeof(ret)); + + talloc_free(mem_ctx); + return ret; +} diff --git a/ctdb/server/ctdb_traverse.c b/ctdb/server/ctdb_traverse.c new file mode 100644 index 0000000..4865dcc --- /dev/null +++ b/ctdb/server/ctdb_traverse.c @@ -0,0 +1,781 @@ +/* + efficient async ctdb traverse + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/network.h" +#include "system/wait.h" +#include "system/time.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/util_process.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/reqid.h" +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" + +typedef void (*ctdb_traverse_fn_t)(void *private_data, TDB_DATA key, TDB_DATA data); + +/* + handle returned to caller - freeing this handler will kill the child and + terminate the traverse + */ +struct ctdb_traverse_local_handle { + struct ctdb_traverse_local_handle *next, *prev; + struct ctdb_db_context *ctdb_db; + int fd[2]; + pid_t child; + uint64_t srvid; + uint32_t client_reqid; + uint32_t reqid; + int srcnode; + void *private_data; + ctdb_traverse_fn_t callback; + bool withemptyrecords; + struct tevent_fd *fde; + int records_failed; + int records_sent; +}; + +/* + * called when traverse is completed by child or on error + */ +static void ctdb_traverse_child_handler(struct tevent_context *ev, struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + struct ctdb_traverse_local_handle *h = talloc_get_type(private_data, + struct ctdb_traverse_local_handle); + ctdb_traverse_fn_t callback = h->callback; + void *p = h->private_data; + int res; + ssize_t n; + + /* Read the number of records sent by traverse child */ + n = 
sys_read(h->fd[0], &res, sizeof(res)); + if (n < 0 || n != sizeof(res)) { + /* Traverse child failed */ + DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d\n", + h->ctdb_db->db_name, h->reqid)); + } else if (res < 0) { + /* Traverse failed */ + res = -res; + DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d records:%d\n", + h->ctdb_db->db_name, h->reqid, res)); + } else { + DEBUG(DEBUG_INFO, ("Local traverse end db:%s reqid:%d records:%d\n", + h->ctdb_db->db_name, h->reqid, res)); + } + + callback(p, tdb_null, tdb_null); +} + +/* + destroy a in-flight traverse operation + */ +static int traverse_local_destructor(struct ctdb_traverse_local_handle *h) +{ + DLIST_REMOVE(h->ctdb_db->traverse, h); + ctdb_kill(h->ctdb_db->ctdb, h->child, SIGKILL); + return 0; +} + +/* + callback from tdb_traverse_read() + */ +static int ctdb_traverse_local_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p) +{ + struct ctdb_traverse_local_handle *h = talloc_get_type(p, + struct ctdb_traverse_local_handle); + struct ctdb_rec_data_old *d; + struct ctdb_ltdb_header *hdr; + int res, status; + TDB_DATA outdata; + + hdr = (struct ctdb_ltdb_header *)data.dptr; + + if (ctdb_db_volatile(h->ctdb_db)) { + /* filter out zero-length records */ + if (!h->withemptyrecords && + data.dsize <= sizeof(struct ctdb_ltdb_header)) + { + return 0; + } + + /* filter out non-authoritative records */ + if (hdr->dmaster != h->ctdb_db->ctdb->pnn) { + return 0; + } + } + + d = ctdb_marshall_record(h, h->reqid, key, NULL, data); + if (d == NULL) { + /* error handling is tricky in this child code .... 
*/ + h->records_failed++; + return -1; + } + + outdata.dptr = (uint8_t *)d; + outdata.dsize = d->length; + + res = ctdb_control(h->ctdb_db->ctdb, h->srcnode, 0, CTDB_CONTROL_TRAVERSE_DATA, + CTDB_CTRL_FLAG_NOREPLY, outdata, NULL, NULL, &status, NULL, NULL); + if (res != 0 || status != 0) { + h->records_failed++; + return -1; + } + + h->records_sent++; + return 0; +} + +struct traverse_all_state { + struct ctdb_context *ctdb; + struct ctdb_traverse_local_handle *h; + uint32_t reqid; + uint32_t srcnode; + uint32_t client_reqid; + uint64_t srvid; + bool withemptyrecords; +}; + +/* + setup a non-blocking traverse of a local ltdb. The callback function + will be called on every record in the local ltdb. To stop the + traverse, talloc_free() the traverse_handle. + + The traverse is finished when the callback is called with tdb_null for key and data + */ +static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_context *ctdb_db, + ctdb_traverse_fn_t callback, + struct traverse_all_state *all_state) +{ + struct ctdb_traverse_local_handle *h; + int ret; + + h = talloc_zero(all_state, struct ctdb_traverse_local_handle); + if (h == NULL) { + return NULL; + } + + ret = pipe(h->fd); + + if (ret != 0) { + talloc_free(h); + return NULL; + } + + h->child = ctdb_fork(ctdb_db->ctdb); + + if (h->child == (pid_t)-1) { + close(h->fd[0]); + close(h->fd[1]); + talloc_free(h); + return NULL; + } + + h->callback = callback; + h->private_data = all_state; + h->ctdb_db = ctdb_db; + h->client_reqid = all_state->client_reqid; + h->reqid = all_state->reqid; + h->srvid = all_state->srvid; + h->srcnode = all_state->srcnode; + h->withemptyrecords = all_state->withemptyrecords; + + if (h->child == 0) { + /* start the traverse in the child */ + int res, status; + pid_t parent = getpid(); + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct ctdb_rec_data_old *d; + TDB_DATA outdata; + + close(h->fd[0]); + + prctl_set_comment("ctdb_traverse"); + if 
(switch_from_server_to_client(ctdb) != 0) { + DEBUG(DEBUG_CRIT, ("Failed to switch traverse child into client mode\n")); + _exit(0); + } + + d = ctdb_marshall_record(h, h->reqid, tdb_null, NULL, tdb_null); + if (d == NULL) { + res = 0; + sys_write(h->fd[1], &res, sizeof(int)); + _exit(0); + } + + res = tdb_traverse_read(ctdb_db->ltdb->tdb, ctdb_traverse_local_fn, h); + if (res == -1 || h->records_failed > 0) { + /* traverse failed */ + res = -(h->records_sent); + } else { + res = h->records_sent; + } + + /* Wait till all the data is flushed from output queue */ + while (ctdb_queue_length(ctdb->daemon.queue) > 0) { + tevent_loop_once(ctdb->ev); + } + + /* End traverse by sending empty record */ + outdata.dptr = (uint8_t *)d; + outdata.dsize = d->length; + ret = ctdb_control(ctdb, h->srcnode, 0, + CTDB_CONTROL_TRAVERSE_DATA, + CTDB_CTRL_FLAG_NOREPLY, outdata, + NULL, NULL, &status, NULL, NULL); + if (ret == -1 || status == -1) { + if (res > 0) { + res = -res; + } + } + + sys_write(h->fd[1], &res, sizeof(res)); + + ctdb_wait_for_process_to_exit(parent); + _exit(0); + } + + close(h->fd[1]); + set_close_on_exec(h->fd[0]); + + talloc_set_destructor(h, traverse_local_destructor); + + DLIST_ADD(ctdb_db->traverse, h); + + h->fde = tevent_add_fd(ctdb_db->ctdb->ev, h, h->fd[0], TEVENT_FD_READ, + ctdb_traverse_child_handler, h); + if (h->fde == NULL) { + close(h->fd[0]); + talloc_free(h); + return NULL; + } + tevent_fd_set_auto_close(h->fde); + + return h; +} + + +struct ctdb_traverse_all_handle { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + uint32_t reqid; + ctdb_traverse_fn_t callback; + void *private_data; + uint32_t null_count; + bool timedout; +}; + +/* + destroy a traverse_all op + */ +static int ctdb_traverse_all_destructor(struct ctdb_traverse_all_handle *state) +{ + reqid_remove(state->ctdb->idr, state->reqid); + return 0; +} + +/* called when a traverse times out */ +static void ctdb_traverse_all_timeout(struct tevent_context *ev, + struct 
tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_traverse_all_handle *state = talloc_get_type(private_data, struct ctdb_traverse_all_handle); + + DEBUG(DEBUG_ERR,(__location__ " Traverse all timeout on database:%s\n", state->ctdb_db->db_name)); + CTDB_INCREMENT_STAT(state->ctdb, timeouts.traverse); + + state->timedout = true; + state->callback(state->private_data, tdb_null, tdb_null); +} + + +struct traverse_start_state { + struct ctdb_context *ctdb; + struct ctdb_traverse_all_handle *h; + uint32_t srcnode; + uint32_t reqid; + uint32_t db_id; + uint64_t srvid; + bool withemptyrecords; + int num_records; +}; + + +/* + setup a cluster-wide non-blocking traverse of a ctdb. The + callback function will be called on every record in the local + ltdb. To stop the traverse, talloc_free() the traverse_handle. + + The traverse is finished when the callback is called with tdb_null + for key and data + */ +static struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_context *ctdb_db, + ctdb_traverse_fn_t callback, + struct traverse_start_state *start_state) +{ + struct ctdb_traverse_all_handle *state; + struct ctdb_context *ctdb = ctdb_db->ctdb; + int ret; + TDB_DATA data; + struct ctdb_traverse_all r; + struct ctdb_traverse_all_ext r_ext; + uint32_t destination; + + state = talloc(start_state, struct ctdb_traverse_all_handle); + if (state == NULL) { + return NULL; + } + + state->ctdb = ctdb; + state->ctdb_db = ctdb_db; + state->reqid = reqid_new(ctdb_db->ctdb->idr, state); + state->callback = callback; + state->private_data = start_state; + state->null_count = 0; + state->timedout = false; + + talloc_set_destructor(state, ctdb_traverse_all_destructor); + + if (start_state->withemptyrecords) { + r_ext.db_id = ctdb_db->db_id; + r_ext.reqid = state->reqid; + r_ext.pnn = ctdb->pnn; + r_ext.client_reqid = start_state->reqid; + r_ext.srvid = start_state->srvid; + r_ext.withemptyrecords = start_state->withemptyrecords; + + data.dptr = 
(uint8_t *)&r_ext; + data.dsize = sizeof(r_ext); + } else { + r.db_id = ctdb_db->db_id; + r.reqid = state->reqid; + r.pnn = ctdb->pnn; + r.client_reqid = start_state->reqid; + r.srvid = start_state->srvid; + + data.dptr = (uint8_t *)&r; + data.dsize = sizeof(r); + } + + if (ctdb_db_volatile(ctdb_db)) { + /* volatile database, traverse all active nodes */ + destination = CTDB_BROADCAST_ACTIVE; + } else { + unsigned int i; + /* persistent database, traverse one node, preferably + * the local one + */ + destination = ctdb->pnn; + /* check we are in the vnnmap */ + for (i=0; i < ctdb->vnn_map->size; i++) { + if (ctdb->vnn_map->map[i] == ctdb->pnn) { + break; + } + } + /* if we are not in the vnn map we just pick the first + * node instead + */ + if (i == ctdb->vnn_map->size) { + destination = ctdb->vnn_map->map[0]; + } + } + + /* tell all the nodes in the cluster to start sending records to this + * node, or if it is a persistent database, just tell the local + * node + */ + + if (start_state->withemptyrecords) { + ret = ctdb_daemon_send_control(ctdb, destination, 0, + CTDB_CONTROL_TRAVERSE_ALL_EXT, + 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL); + } else { + ret = ctdb_daemon_send_control(ctdb, destination, 0, + CTDB_CONTROL_TRAVERSE_ALL, + 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL); + } + + if (ret != 0) { + talloc_free(state); + return NULL; + } + + DEBUG(DEBUG_NOTICE,("Starting traverse on DB %s (id %d)\n", + ctdb_db->db_name, state->reqid)); + + /* timeout the traverse */ + tevent_add_timer(ctdb->ev, state, + timeval_current_ofs(ctdb->tunable.traverse_timeout, 0), + ctdb_traverse_all_timeout, state); + + return state; +} + +/* + called when local traverse ends + */ +static void traverse_all_callback(void *p, TDB_DATA key, TDB_DATA data) +{ + struct traverse_all_state *state = talloc_get_type(p, struct traverse_all_state); + + /* we're done */ + talloc_free(state); +} + +/* + * extended version to take the "withemptyrecords" parameter" + */ +int32_t 
ctdb_control_traverse_all_ext(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata) +{ + struct ctdb_traverse_all_ext *c = (struct ctdb_traverse_all_ext *)data.dptr; + struct traverse_all_state *state; + struct ctdb_db_context *ctdb_db; + + if (data.dsize != sizeof(struct ctdb_traverse_all_ext)) { + DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all_ext\n")); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, c->db_id); + if (ctdb_db == NULL) { + return -1; + } + + if (ctdb_db->unhealthy_reason) { + if (ctdb->tunable.allow_unhealthy_db_read == 0) { + DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + return -1; + } + DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + } + + state = talloc(ctdb_db, struct traverse_all_state); + if (state == NULL) { + return -1; + } + + state->reqid = c->reqid; + state->srcnode = c->pnn; + state->ctdb = ctdb; + state->client_reqid = c->client_reqid; + state->srvid = c->srvid; + state->withemptyrecords = c->withemptyrecords; + + state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state); + if (state->h == NULL) { + talloc_free(state); + return -1; + } + + return 0; +} + +/* + called when a CTDB_CONTROL_TRAVERSE_ALL control comes in. 
We then + setup a traverse of our local ltdb, sending the records as + CTDB_CONTROL_TRAVERSE_DATA records back to the originator + */ +int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata) +{ + struct ctdb_traverse_all *c = (struct ctdb_traverse_all *)data.dptr; + struct traverse_all_state *state; + struct ctdb_db_context *ctdb_db; + + if (data.dsize != sizeof(struct ctdb_traverse_all)) { + DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all\n")); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, c->db_id); + if (ctdb_db == NULL) { + return -1; + } + + if (ctdb_db->unhealthy_reason) { + if (ctdb->tunable.allow_unhealthy_db_read == 0) { + DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + return -1; + } + DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + } + + state = talloc(ctdb_db, struct traverse_all_state); + if (state == NULL) { + return -1; + } + + state->reqid = c->reqid; + state->srcnode = c->pnn; + state->ctdb = ctdb; + state->client_reqid = c->client_reqid; + state->srvid = c->srvid; + state->withemptyrecords = false; + + state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state); + if (state->h == NULL) { + talloc_free(state); + return -1; + } + + return 0; +} + + +/* + called when a CTDB_CONTROL_TRAVERSE_DATA control comes in. 
We then + call the traverse_all callback with the record + */ +int32_t ctdb_control_traverse_data(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata) +{ + struct ctdb_rec_data_old *d = (struct ctdb_rec_data_old *)data.dptr; + struct ctdb_traverse_all_handle *state; + TDB_DATA key; + ctdb_traverse_fn_t callback; + void *private_data; + + if (data.dsize < sizeof(uint32_t) || data.dsize != d->length) { + DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_data\n")); + return -1; + } + + state = reqid_find(ctdb->idr, d->reqid, struct ctdb_traverse_all_handle); + if (state == NULL || d->reqid != state->reqid) { + /* traverse might have been terminated already */ + return -1; + } + + key.dsize = d->keylen; + key.dptr = &d->data[0]; + data.dsize = d->datalen; + data.dptr = &d->data[d->keylen]; + + if (key.dsize == 0 && data.dsize == 0) { + state->null_count++; + /* Persistent databases are only scanned on one node (the local + * node) + */ + if (ctdb_db_volatile(state->ctdb_db)) { + if (state->null_count != ctdb_get_num_active_nodes(ctdb)) { + return 0; + } + } + } + + callback = state->callback; + private_data = state->private_data; + + callback(private_data, key, data); + return 0; +} + +/* + kill a in-progress traverse, used when a client disconnects + */ +int32_t ctdb_control_traverse_kill(struct ctdb_context *ctdb, TDB_DATA data, + TDB_DATA *outdata, uint32_t srcnode) +{ + struct ctdb_traverse_start *d = (struct ctdb_traverse_start *)data.dptr; + struct ctdb_db_context *ctdb_db; + struct ctdb_traverse_local_handle *t; + + ctdb_db = find_ctdb_db(ctdb, d->db_id); + if (ctdb_db == NULL) { + return -1; + } + + for (t=ctdb_db->traverse; t; t=t->next) { + if (t->client_reqid == d->reqid && + t->srvid == d->srvid) { + talloc_free(t); + break; + } + } + + return 0; +} + + +/* + this is called when a client disconnects during a traverse + we need to notify all the nodes taking part in the search that they + should kill their traverse children + */ +static int 
ctdb_traverse_start_destructor(struct traverse_start_state *state) +{ + struct ctdb_traverse_start r; + TDB_DATA data; + + DEBUG(DEBUG_ERR,(__location__ " Traverse cancelled by client disconnect for database:0x%08x\n", state->db_id)); + r.db_id = state->db_id; + r.reqid = state->reqid; + r.srvid = state->srvid; + + data.dptr = (uint8_t *)&r; + data.dsize = sizeof(r); + + ctdb_daemon_send_control(state->ctdb, CTDB_BROADCAST_CONNECTED, 0, + CTDB_CONTROL_TRAVERSE_KILL, + 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL); + return 0; +} + +/* + callback which sends records as messages to the client + */ +static void traverse_start_callback(void *p, TDB_DATA key, TDB_DATA data) +{ + struct traverse_start_state *state; + struct ctdb_rec_data_old *d; + TDB_DATA cdata; + + state = talloc_get_type(p, struct traverse_start_state); + + d = ctdb_marshall_record(state, state->reqid, key, NULL, data); + if (d == NULL) { + return; + } + + cdata.dptr = (uint8_t *)d; + cdata.dsize = d->length; + + srvid_dispatch(state->ctdb->srv, state->srvid, 0, cdata); + if (key.dsize == 0 && data.dsize == 0) { + DEBUG(DEBUG_NOTICE, ("Ending traverse on DB %s (id %d), records %d\n", + state->h->ctdb_db->db_name, state->h->reqid, + state->num_records)); + + if (state->h->timedout) { + /* timed out, send TRAVERSE_KILL control */ + talloc_free(state); + } else { + /* end of traverse */ + talloc_set_destructor(state, NULL); + talloc_free(state); + } + } else { + state->num_records++; + } +} + + +/** + * start a traverse_all - called as a control from a client. + * extended version to take the "withemptyrecords" parameter. 
+ */ +int32_t ctdb_control_traverse_start_ext(struct ctdb_context *ctdb, + TDB_DATA data, + TDB_DATA *outdata, + uint32_t srcnode, + uint32_t client_id) +{ + struct ctdb_traverse_start_ext *d = (struct ctdb_traverse_start_ext *)data.dptr; + struct traverse_start_state *state; + struct ctdb_db_context *ctdb_db; + struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + + if (client == NULL) { + DEBUG(DEBUG_ERR,(__location__ " No client found\n")); + return -1; + } + + if (data.dsize != sizeof(*d)) { + DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_start\n")); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, d->db_id); + if (ctdb_db == NULL) { + return -1; + } + + if (ctdb_db->unhealthy_reason) { + if (ctdb->tunable.allow_unhealthy_db_read == 0) { + DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_start: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + return -1; + } + DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_start: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + } + + state = talloc(client, struct traverse_start_state); + if (state == NULL) { + return -1; + } + + state->srcnode = srcnode; + state->reqid = d->reqid; + state->srvid = d->srvid; + state->db_id = d->db_id; + state->ctdb = ctdb; + state->withemptyrecords = d->withemptyrecords; + state->num_records = 0; + + state->h = ctdb_daemon_traverse_all(ctdb_db, traverse_start_callback, state); + if (state->h == NULL) { + talloc_free(state); + return -1; + } + + talloc_set_destructor(state, ctdb_traverse_start_destructor); + + return 0; +} + +/** + * start a traverse_all - called as a control from a client. 
+ */ +int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb, + TDB_DATA data, + TDB_DATA *outdata, + uint32_t srcnode, + uint32_t client_id) +{ + struct ctdb_traverse_start *d = (struct ctdb_traverse_start *)data.dptr; + struct ctdb_traverse_start_ext d2; + TDB_DATA data2; + + ZERO_STRUCT(d2); + d2.db_id = d->db_id; + d2.reqid = d->reqid; + d2.srvid = d->srvid; + d2.withemptyrecords = false; + + data2.dsize = sizeof(d2); + data2.dptr = (uint8_t *)&d2; + + return ctdb_control_traverse_start_ext(ctdb, data2, outdata, srcnode, client_id); +} diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c new file mode 100644 index 0000000..0dce656 --- /dev/null +++ b/ctdb/server/ctdb_tunables.c @@ -0,0 +1,170 @@ +/* + ctdb tunables code + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ +#include "replace.h" +#include "system/network.h" + +#include <talloc.h> +#include <tdb.h> + +#include "lib/util/debug.h" + +#include "ctdb_private.h" + +#include "common/common.h" +#include "common/logging.h" +#include "common/path.h" +#include "common/tunable.h" + +/* + set all tunables to defaults + */ +void ctdb_tunables_set_defaults(struct ctdb_context *ctdb) +{ + ctdb_tunable_set_defaults(&ctdb->tunable); +} + + +/* + get a tunable + */ +int32_t ctdb_control_get_tunable(struct ctdb_context *ctdb, TDB_DATA indata, + TDB_DATA *outdata) +{ + struct ctdb_control_get_tunable *t = + (struct ctdb_control_get_tunable *)indata.dptr; + char *name; + uint32_t val; + bool ret; + + if (indata.dsize < sizeof(*t) || + t->length > indata.dsize - offsetof(struct ctdb_control_get_tunable, name)) { + DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_get_tunable\n")); + return -1; + } + + name = talloc_strndup(ctdb, (char*)t->name, t->length); + CTDB_NO_MEMORY(ctdb, name); + + ret = ctdb_tunable_get_value(&ctdb->tunable, name, &val); + talloc_free(name); + if (! ret) { + return -EINVAL; + } + + outdata->dptr = (uint8_t *)talloc(outdata, uint32_t); + CTDB_NO_MEMORY(ctdb, outdata->dptr); + + *(uint32_t *)outdata->dptr = val; + outdata->dsize = sizeof(uint32_t); + + return 0; +} + + +/* + set a tunable + */ +int32_t ctdb_control_set_tunable(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_tunable_old *t = + (struct ctdb_tunable_old *)indata.dptr; + char *name; + int ret; + bool obsolete; + + if (indata.dsize < sizeof(*t) || + t->length > indata.dsize - offsetof(struct ctdb_tunable_old, name)) { + DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tunable\n")); + return -1; + } + + name = talloc_strndup(ctdb, (char *)t->name, t->length); + CTDB_NO_MEMORY(ctdb, name); + + ret = ctdb_tunable_set_value(&ctdb->tunable, name, t->value, + &obsolete); + if (! 
ret) { + talloc_free(name); + return -1; + } + + if (obsolete) { + DEBUG(DEBUG_WARNING, + ("Setting obsolete tunable \"%s\"\n", name)); + talloc_free(name); + return 1; + } + + talloc_free(name); + return 0; +} + +/* + list tunables + */ +int32_t ctdb_control_list_tunables(struct ctdb_context *ctdb, TDB_DATA *outdata) +{ + char *list = NULL; + struct ctdb_control_list_tunable *t; + + list = ctdb_tunable_names_to_string(outdata); + CTDB_NO_MEMORY(ctdb, list); + + outdata->dsize = offsetof(struct ctdb_control_list_tunable, data) + + strlen(list) + 1; + outdata->dptr = talloc_size(outdata, outdata->dsize); + CTDB_NO_MEMORY(ctdb, outdata->dptr); + + t = (struct ctdb_control_list_tunable *)outdata->dptr; + t->length = strlen(list)+1; + + memcpy(t->data, list, t->length); + talloc_free(list); + + return 0; +} + +bool ctdb_tunables_load(struct ctdb_context *ctdb) +{ + bool status; + TALLOC_CTX *tmp_ctx; + char *file = NULL; + + /* Fail by default */ + status = false; + + tmp_ctx = talloc_new(ctdb); + if (tmp_ctx == NULL) { + DBG_ERR("Memory allocation error\n"); + goto done; + } + + file = path_etcdir_append(tmp_ctx, "ctdb.tunables"); + if (file == NULL) { + D_ERR("Failed to construct path for ctdb.tunables\n"); + goto done; + } + + status = ctdb_tunable_load_file(tmp_ctx, &ctdb->tunable, file); + /* No need to log error, already logged above */ + +done: + talloc_free(tmp_ctx); + return status; +} diff --git a/ctdb/server/ctdb_tunnel.c b/ctdb/server/ctdb_tunnel.c new file mode 100644 index 0000000..2df9474 --- /dev/null +++ b/ctdb/server/ctdb_tunnel.c @@ -0,0 +1,141 @@ +/* + ctdb_tunnel protocol code + + Copyright (C) Amitay Isaacs 2017 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" + +#include <talloc.h> +#include <tevent.h> +#include <tdb.h> + +#include "lib/util/debug.h" + +#include "common/logging.h" +#include "common/reqid.h" +#include "common/srvid.h" + +#include "ctdb_private.h" + +int32_t ctdb_control_tunnel_register(struct ctdb_context *ctdb, + uint32_t client_id, uint64_t tunnel_id) +{ + struct ctdb_client *client; + int ret; + + client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + if (client == NULL) { + DEBUG(DEBUG_ERR, ("Bad client_id in ctdb_tunnel_register\n")); + return -1; + } + + ret = srvid_exists(ctdb->tunnels, tunnel_id, NULL); + if (ret == 0) { + DEBUG(DEBUG_ERR, + ("Tunnel id 0x%"PRIx64" already registered\n", + tunnel_id)); + return -1; + } + + ret = srvid_register(ctdb->tunnels, client, tunnel_id, + daemon_tunnel_handler, client); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed to register tunnel id 0x%"PRIx64"\n", + tunnel_id)); + return -1; + } + + DEBUG(DEBUG_INFO, ("Registered tunnel for id 0x%"PRIx64"\n", + tunnel_id)); + return 0; +} + +int32_t ctdb_control_tunnel_deregister(struct ctdb_context *ctdb, + uint32_t client_id, uint64_t tunnel_id) +{ + struct ctdb_client *client; + int ret; + + client = reqid_find(ctdb->idr, client_id, struct ctdb_client); + if (client == NULL) { + DEBUG(DEBUG_ERR, ("Bad client_id in ctdb_tunnel_deregister\n")); + return -1; + } + + ret = srvid_deregister(ctdb->tunnels, tunnel_id, client); + if (ret != 0) { + DEBUG(DEBUG_ERR, + ("Failed to deregister tunnel id 0x%"PRIx64"\n", + tunnel_id)); + return -1; + } + + return 0; +} + +int 
ctdb_daemon_send_tunnel(struct ctdb_context *ctdb, uint32_t destnode, + uint64_t tunnel_id, uint32_t flags, TDB_DATA data) +{ + struct ctdb_req_tunnel_old *c; + size_t len; + + if (ctdb->methods == NULL) { + DEBUG(DEBUG_INFO, + ("Failed to send tunnel. Transport is DOWN\n")); + return -1; + } + + len = offsetof(struct ctdb_req_tunnel_old, data) + data.dsize; + c = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_TUNNEL, len, + struct ctdb_req_tunnel_old); + if (c == NULL) { + DEBUG(DEBUG_ERR, + ("Memory error in ctdb_daemon_send_tunnel()\n")); + return -1; + } + + c->hdr.destnode = destnode; + c->tunnel_id = tunnel_id; + c->flags = flags; + c->datalen = data.dsize; + memcpy(c->data, data.dptr, data.dsize); + + ctdb_queue_packet(ctdb, &c->hdr); + + talloc_free(c); + return 0; +} + +void ctdb_request_tunnel(struct ctdb_context *ctdb, + struct ctdb_req_header *hdr) +{ + struct ctdb_req_tunnel_old *c = + (struct ctdb_req_tunnel_old *)hdr; + TDB_DATA data; + int ret; + + data.dsize = hdr->length; + data.dptr = (uint8_t *)c; + + ret = srvid_dispatch(ctdb->tunnels, c->tunnel_id, 0, data); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Tunnel id 0x%"PRIx64" not registered\n", + c->tunnel_id)); + } +} diff --git a/ctdb/server/ctdb_update_record.c b/ctdb/server/ctdb_update_record.c new file mode 100644 index 0000000..405499c --- /dev/null +++ b/ctdb/server/ctdb_update_record.c @@ -0,0 +1,372 @@ +/* + implementation of the update record control + + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Ronnie Sahlberg 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" +#include "system/time.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/util_process.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/system.h" +#include "common/common.h" +#include "common/logging.h" + +struct ctdb_persistent_write_state { + struct ctdb_db_context *ctdb_db; + struct ctdb_marshall_buffer *m; + struct ctdb_req_control_old *c; + uint32_t flags; +}; + +/* don't create/update records that does not exist locally */ +#define UPDATE_FLAGS_REPLACE_ONLY 1 + +/* + called from a child process to write the data + */ +static int ctdb_persistent_store(struct ctdb_persistent_write_state *state) +{ + unsigned int i; + int ret; + struct ctdb_rec_data_old *rec = NULL; + struct ctdb_marshall_buffer *m = state->m; + + ret = tdb_transaction_start(state->ctdb_db->ltdb->tdb); + if (ret == -1) { + DEBUG(DEBUG_ERR,("Failed to start transaction for db_id 0x%08x in ctdb_persistent_store\n", + state->ctdb_db->db_id)); + return -1; + } + + for (i=0;i<m->count;i++) { + struct ctdb_ltdb_header oldheader; + struct ctdb_ltdb_header header; + TDB_DATA key, data, olddata; + TALLOC_CTX *tmp_ctx = talloc_new(state); + + rec = ctdb_marshall_loop_next(m, rec, NULL, &header, &key, &data); + + if (rec == NULL) { + D_ERR("Failed to get next record %u for db_id 0x%08x " + "in ctdb_persistent_store\n", + i, + state->ctdb_db->db_id); + talloc_free(tmp_ctx); + goto failed; + } + + /* we must check if the record exists or not because + ctdb_ltdb_fetch will unconditionally create a record + */ + if (state->flags & UPDATE_FLAGS_REPLACE_ONLY) { + TDB_DATA trec; + trec = 
tdb_fetch(state->ctdb_db->ltdb->tdb, key); + if (trec.dsize == 0) { + talloc_free(tmp_ctx); + continue; + } + free(trec.dptr); + } + + /* fetch the old header and ensure the rsn is less than the new rsn */ + ret = ctdb_ltdb_fetch(state->ctdb_db, key, &oldheader, tmp_ctx, &olddata); + if (ret != 0) { + DEBUG(DEBUG_ERR,("Failed to fetch old record for db_id 0x%08x in ctdb_persistent_store\n", + state->ctdb_db->db_id)); + talloc_free(tmp_ctx); + goto failed; + } + + if (oldheader.rsn >= header.rsn && + (olddata.dsize != data.dsize || + memcmp(olddata.dptr, data.dptr, data.dsize) != 0)) { + DEBUG(DEBUG_CRIT,("existing header for db_id 0x%08x has larger RSN %llu than new RSN %llu in ctdb_persistent_store\n", + state->ctdb_db->db_id, + (unsigned long long)oldheader.rsn, (unsigned long long)header.rsn)); + talloc_free(tmp_ctx); + goto failed; + } + + talloc_free(tmp_ctx); + + ret = ctdb_ltdb_store(state->ctdb_db, key, &header, data); + if (ret != 0) { + DEBUG(DEBUG_CRIT,("Failed to store record for db_id 0x%08x in ctdb_persistent_store\n", + state->ctdb_db->db_id)); + goto failed; + } + } + + ret = tdb_transaction_commit(state->ctdb_db->ltdb->tdb); + if (ret == -1) { + DEBUG(DEBUG_ERR,("Failed to commit transaction for db_id 0x%08x in ctdb_persistent_store\n", + state->ctdb_db->db_id)); + return -1; + } + + return 0; + +failed: + tdb_transaction_cancel(state->ctdb_db->ltdb->tdb); + return -1; +} + + +/* + called when we the child has completed the persistent write + on our behalf + */ +static void ctdb_persistent_write_callback(int status, void *private_data) +{ + struct ctdb_persistent_write_state *state = talloc_get_type(private_data, + struct ctdb_persistent_write_state); + + + ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, status, NULL); + + talloc_free(state); +} + +/* + called if our lockwait child times out + */ +static void ctdb_persistent_lock_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) 
+{ + struct ctdb_persistent_write_state *state = talloc_get_type(private_data, + struct ctdb_persistent_write_state); + ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, -1, "timeout in ctdb_persistent_lock"); + talloc_free(state); +} + +struct childwrite_handle { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + struct tevent_fd *fde; + int fd[2]; + pid_t child; + void *private_data; + void (*callback)(int, void *); + struct timeval start_time; +}; + +static int childwrite_destructor(struct childwrite_handle *h) +{ + CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls); + ctdb_kill(h->ctdb, h->child, SIGKILL); + return 0; +} + +/* called when the child process has finished writing the record to the + database +*/ +static void childwrite_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + struct childwrite_handle *h = talloc_get_type(private_data, + struct childwrite_handle); + void *p = h->private_data; + void (*callback)(int, void *) = h->callback; + pid_t child = h->child; + TALLOC_CTX *tmp_ctx = talloc_new(ev); + int ret; + char c; + + CTDB_UPDATE_LATENCY(h->ctdb, h->ctdb_db, "persistent", childwrite_latency, h->start_time); + CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls); + + /* the handle needs to go away when the context is gone - when + the handle goes away this implicitly closes the pipe, which + kills the child */ + talloc_steal(tmp_ctx, h); + + talloc_set_destructor(h, NULL); + + ret = sys_read(h->fd[0], &c, 1); + if (ret < 1) { + DEBUG(DEBUG_ERR, (__location__ " Read returned %d. Childwrite failed\n", ret)); + c = 1; + } + + callback(c, p); + + ctdb_kill(h->ctdb, child, SIGKILL); + talloc_free(tmp_ctx); +} + +/* this creates a child process which will take out a tdb transaction + and write the record to the database. 
+*/ +static struct childwrite_handle *ctdb_childwrite( + struct ctdb_db_context *ctdb_db, + void (*callback)(int, void *private_data), + struct ctdb_persistent_write_state *state) +{ + struct childwrite_handle *result; + int ret; + pid_t parent = getpid(); + + CTDB_INCREMENT_STAT(ctdb_db->ctdb, childwrite_calls); + CTDB_INCREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls); + + if (!(result = talloc_zero(state, struct childwrite_handle))) { + CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls); + return NULL; + } + + ret = pipe(result->fd); + + if (ret != 0) { + talloc_free(result); + CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls); + return NULL; + } + + result->child = ctdb_fork(ctdb_db->ctdb); + + if (result->child == (pid_t)-1) { + close(result->fd[0]); + close(result->fd[1]); + talloc_free(result); + CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls); + return NULL; + } + + result->callback = callback; + result->private_data = state; + result->ctdb = ctdb_db->ctdb; + result->ctdb_db = ctdb_db; + + if (result->child == 0) { + char c = 0; + + close(result->fd[0]); + prctl_set_comment("ctdb_write_persistent"); + ret = ctdb_persistent_store(state); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Failed to write persistent data\n")); + c = 1; + } + + sys_write(result->fd[1], &c, 1); + + ctdb_wait_for_process_to_exit(parent); + _exit(0); + } + + close(result->fd[1]); + set_close_on_exec(result->fd[0]); + + talloc_set_destructor(result, childwrite_destructor); + + DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for ctdb_childwrite\n", result->fd[0])); + + result->fde = tevent_add_fd(ctdb_db->ctdb->ev, result, result->fd[0], + TEVENT_FD_READ, childwrite_handler, + (void *)result); + if (result->fde == NULL) { + talloc_free(result); + CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls); + return NULL; + } + tevent_fd_set_auto_close(result->fde); + + result->start_time = timeval_current(); + + return result; +} + +/* 
+ update a record on this node if the new record has a higher rsn than the + current record + */ +int32_t ctdb_control_update_record(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, TDB_DATA recdata, + bool *async_reply) +{ + struct ctdb_db_context *ctdb_db; + struct ctdb_persistent_write_state *state; + struct childwrite_handle *handle; + struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr; + + if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { + DEBUG(DEBUG_INFO,("rejecting ctdb_control_update_record when recovery active\n")); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, m->db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR,("Unknown database 0x%08x in ctdb_control_update_record\n", m->db_id)); + return -1; + } + + if (ctdb_db->unhealthy_reason) { + DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_update_record: %s\n", + ctdb_db->db_name, ctdb_db->unhealthy_reason)); + return -1; + } + + state = talloc(ctdb, struct ctdb_persistent_write_state); + CTDB_NO_MEMORY(ctdb, state); + + state->ctdb_db = ctdb_db; + state->c = c; + state->m = m; + state->flags = 0; + if (ctdb_db_volatile(ctdb_db)) { + state->flags = UPDATE_FLAGS_REPLACE_ONLY; + } + + /* create a child process to take out a transaction and + write the data. 
+ */ + handle = ctdb_childwrite(ctdb_db, ctdb_persistent_write_callback, state); + if (handle == NULL) { + DEBUG(DEBUG_ERR,("Failed to setup childwrite handler in ctdb_control_update_record\n")); + talloc_free(state); + return -1; + } + + /* we need to wait for the replies */ + *async_reply = true; + + /* need to keep the control structure around */ + talloc_steal(state, c); + + /* but we won't wait forever */ + tevent_add_timer(ctdb->ev, state, + timeval_current_ofs(ctdb->tunable.control_timeout, 0), + ctdb_persistent_lock_timeout, state); + + return 0; +} + diff --git a/ctdb/server/ctdb_uptime.c b/ctdb/server/ctdb_uptime.c new file mode 100644 index 0000000..53025f5 --- /dev/null +++ b/ctdb/server/ctdb_uptime.c @@ -0,0 +1,55 @@ +/* + ctdb uptime code + + Copyright (C) Ronnie Sahlberg 2008 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/syslog.h" +#include "system/time.h" +#include "system/filesys.h" +#include "system/network.h" + +#include <talloc.h> + +#include "lib/util/debug.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "common/common.h" +#include "common/logging.h" + +/* + returns the ctdb uptime +*/ +int32_t ctdb_control_uptime(struct ctdb_context *ctdb, TDB_DATA *outdata) +{ + struct ctdb_uptime *uptime; + + uptime = talloc_zero(outdata, struct ctdb_uptime); + CTDB_NO_MEMORY(ctdb, uptime); + + gettimeofday(&uptime->current_time, NULL); + uptime->ctdbd_start_time = ctdb->ctdbd_start_time; + uptime->last_recovery_started = ctdb->last_recovery_started; + uptime->last_recovery_finished = ctdb->last_recovery_finished; + + outdata->dsize = sizeof(struct ctdb_uptime); + outdata->dptr = (uint8_t *)uptime; + + return 0; +} diff --git a/ctdb/server/ctdb_vacuum.c b/ctdb/server/ctdb_vacuum.c new file mode 100644 index 0000000..7ff79ac --- /dev/null +++ b/ctdb/server/ctdb_vacuum.c @@ -0,0 +1,1990 @@ +/* + ctdb vacuuming events + + Copyright (C) Ronnie Sahlberg 2009 + Copyright (C) Michael Adam 2010-2013 + Copyright (C) Stefan Metzmacher 2010-2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "replace.h" +#include "system/network.h" +#include "system/filesys.h" +#include "system/time.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/tdb_wrap/tdb_wrap.h" +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/sys_rw.h" +#include "lib/util/util_process.h" + +#include "ctdb_private.h" +#include "ctdb_client.h" + +#include "protocol/protocol_private.h" + +#include "common/rb_tree.h" +#include "common/common.h" +#include "common/logging.h" + +#include "protocol/protocol_api.h" + +#define TIMELIMIT() timeval_current_ofs(10, 0) + +enum vacuum_child_status { VACUUM_RUNNING, VACUUM_OK, VACUUM_ERROR, VACUUM_TIMEOUT}; + +struct ctdb_vacuum_child_context { + struct ctdb_vacuum_handle *vacuum_handle; + /* fd child writes status to */ + int fd[2]; + pid_t child_pid; + enum vacuum_child_status status; + struct timeval start_time; + bool scheduled; +}; + +struct ctdb_vacuum_handle { + struct ctdb_db_context *ctdb_db; + uint32_t fast_path_count; + uint32_t vacuum_interval; +}; + + +/* a list of records to possibly delete */ +struct vacuum_data { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + struct tdb_context *dest_db; + trbt_tree_t *delete_list; + struct ctdb_marshall_buffer **vacuum_fetch_list; + struct timeval start; + bool traverse_error; + bool vacuum; + struct { + struct { + uint32_t added_to_vacuum_fetch_list; + uint32_t added_to_delete_list; + uint32_t deleted; + uint32_t skipped; + uint32_t error; + uint32_t total; + } delete_queue; + struct { + uint32_t scheduled; + uint32_t skipped; + uint32_t error; + uint32_t total; + } db_traverse; + struct { + uint32_t total; + uint32_t remote_error; + uint32_t local_error; + uint32_t deleted; + uint32_t skipped; + uint32_t left; + } delete_list; + struct { + uint32_t vacuumed; + uint32_t copied; + } repack; + } count; +}; + +/* this structure contains the information for one record to be deleted */ +struct 
delete_record_data { + struct ctdb_context *ctdb; + struct ctdb_db_context *ctdb_db; + struct ctdb_ltdb_header hdr; + uint32_t remote_fail_count; + TDB_DATA key; + uint8_t keydata[1]; +}; + +struct delete_records_list { + struct ctdb_marshall_buffer *records; + struct vacuum_data *vdata; +}; + +struct fetch_record_data { + TDB_DATA key; + uint8_t keydata[1]; +}; + +static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db, + const struct ctdb_ltdb_header *hdr, + TDB_DATA key); + +/** + * Store key and header in a tree, indexed by the key hash. + */ +static int insert_delete_record_data_into_tree(struct ctdb_context *ctdb, + struct ctdb_db_context *ctdb_db, + trbt_tree_t *tree, + const struct ctdb_ltdb_header *hdr, + TDB_DATA key) +{ + struct delete_record_data *dd; + uint32_t hash; + size_t len; + + len = offsetof(struct delete_record_data, keydata) + key.dsize; + + dd = (struct delete_record_data *)talloc_size(tree, len); + if (dd == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + return -1; + } + talloc_set_name_const(dd, "struct delete_record_data"); + + dd->ctdb = ctdb; + dd->ctdb_db = ctdb_db; + dd->key.dsize = key.dsize; + dd->key.dptr = dd->keydata; + memcpy(dd->keydata, key.dptr, key.dsize); + + dd->hdr = *hdr; + dd->remote_fail_count = 0; + + hash = ctdb_hash(&key); + + trbt_insert32(tree, hash, dd); + + return 0; +} + +static int add_record_to_delete_list(struct vacuum_data *vdata, TDB_DATA key, + struct ctdb_ltdb_header *hdr) +{ + struct ctdb_context *ctdb = vdata->ctdb; + struct ctdb_db_context *ctdb_db = vdata->ctdb_db; + uint32_t hash; + int ret; + + hash = ctdb_hash(&key); + + if (trbt_lookup32(vdata->delete_list, hash)) { + DEBUG(DEBUG_INFO, (__location__ " Hash collision when vacuuming, skipping this record.\n")); + return 0; + } + + ret = insert_delete_record_data_into_tree(ctdb, ctdb_db, + vdata->delete_list, + hdr, key); + if (ret != 0) { + return -1; + } + + vdata->count.delete_list.total++; + + return 0; +} + 
+/** + * Add a record to the list of records to be sent + * to their lmaster with VACUUM_FETCH. + */ +static int add_record_to_vacuum_fetch_list(struct vacuum_data *vdata, + TDB_DATA key) +{ + struct ctdb_context *ctdb = vdata->ctdb; + uint32_t lmaster; + struct ctdb_marshall_buffer *vfl; + + lmaster = ctdb_lmaster(ctdb, &key); + + vfl = vdata->vacuum_fetch_list[lmaster]; + + vfl = ctdb_marshall_add(ctdb, vfl, vfl->db_id, ctdb->pnn, + key, NULL, tdb_null); + if (vfl == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + vdata->traverse_error = true; + return -1; + } + + vdata->vacuum_fetch_list[lmaster] = vfl; + + return 0; +} + + +static void ctdb_vacuum_event(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data); + +static int vacuum_record_parser(TDB_DATA key, TDB_DATA data, void *private_data) +{ + struct ctdb_ltdb_header *header = + (struct ctdb_ltdb_header *)private_data; + + if (data.dsize != sizeof(struct ctdb_ltdb_header)) { + return -1; + } + + *header = *(struct ctdb_ltdb_header *)data.dptr; + + return 0; +} + +/* + * traverse function for gathering the records that can be deleted + */ +static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, + void *private_data) +{ + struct vacuum_data *vdata = talloc_get_type(private_data, + struct vacuum_data); + struct ctdb_context *ctdb = vdata->ctdb; + struct ctdb_db_context *ctdb_db = vdata->ctdb_db; + uint32_t lmaster; + struct ctdb_ltdb_header *hdr; + int res = 0; + + vdata->count.db_traverse.total++; + + lmaster = ctdb_lmaster(ctdb, &key); + if (lmaster >= ctdb->num_nodes) { + vdata->count.db_traverse.error++; + DEBUG(DEBUG_CRIT, (__location__ + " lmaster[%u] >= ctdb->num_nodes[%u] for key" + " with hash[%u]!\n", + (unsigned)lmaster, + (unsigned)ctdb->num_nodes, + (unsigned)ctdb_hash(&key))); + return -1; + } + + if (data.dsize != sizeof(struct ctdb_ltdb_header)) { + /* it is not a deleted record */ + 
vdata->count.db_traverse.skipped++; + return 0; + } + + hdr = (struct ctdb_ltdb_header *)data.dptr; + + if (hdr->dmaster != ctdb->pnn) { + vdata->count.db_traverse.skipped++; + return 0; + } + + /* + * Add the record to this process's delete_queue for processing + * in the subsequent traverse in the fast vacuum run. + */ + res = insert_record_into_delete_queue(ctdb_db, hdr, key); + if (res != 0) { + vdata->count.db_traverse.error++; + } else { + vdata->count.db_traverse.scheduled++; + } + + return 0; +} + +/* + * traverse the tree of records to delete and marshall them into + * a blob + */ +static int delete_marshall_traverse(void *param, void *data) +{ + struct delete_record_data *dd = talloc_get_type(data, struct delete_record_data); + struct delete_records_list *recs = talloc_get_type(param, struct delete_records_list); + struct ctdb_marshall_buffer *m; + + m = ctdb_marshall_add(recs, recs->records, recs->records->db_id, + recs->records->db_id, + dd->key, &dd->hdr, tdb_null); + if (m == NULL) { + DEBUG(DEBUG_ERR, (__location__ " failed to marshall record\n")); + return -1; + } + + recs->records = m; + return 0; +} + +struct fetch_queue_state { + struct ctdb_db_context *ctdb_db; + int count; +}; + +struct fetch_record_migrate_state { + struct fetch_queue_state *fetch_queue; + TDB_DATA key; +}; + +static void fetch_record_migrate_callback(struct ctdb_client_call_state *state) +{ + struct fetch_record_migrate_state *fetch = talloc_get_type_abort( + state->async.private_data, struct fetch_record_migrate_state); + struct fetch_queue_state *fetch_queue = fetch->fetch_queue; + struct ctdb_ltdb_header hdr; + struct ctdb_call call = { 0 }; + int ret; + + ret = ctdb_call_recv(state, &call); + fetch_queue->count--; + if (ret != 0) { + D_ERR("Failed to migrate record for vacuuming\n"); + goto done; + } + + ret = tdb_chainlock_nonblock(fetch_queue->ctdb_db->ltdb->tdb, + fetch->key); + if (ret != 0) { + goto done; + } + + ret = 
tdb_parse_record(fetch_queue->ctdb_db->ltdb->tdb, + fetch->key, + vacuum_record_parser, + &hdr); + + tdb_chainunlock(fetch_queue->ctdb_db->ltdb->tdb, fetch->key); + + if (ret != 0) { + goto done; + } + + D_INFO("Vacuum Fetch record, key=%.*s\n", + (int)fetch->key.dsize, + fetch->key.dptr); + + (void) ctdb_local_schedule_for_deletion(fetch_queue->ctdb_db, + &hdr, + fetch->key); + +done: + talloc_free(fetch); +} + +static int fetch_record_parser(TDB_DATA key, TDB_DATA data, void *private_data) +{ + struct ctdb_ltdb_header *header = + (struct ctdb_ltdb_header *)private_data; + + if (data.dsize < sizeof(struct ctdb_ltdb_header)) { + return -1; + } + + memcpy(header, data.dptr, sizeof(*header)); + return 0; +} + +/** + * traverse function for the traversal of the fetch_queue. + * + * Send a record migration request. + */ +static int fetch_queue_traverse(void *param, void *data) +{ + struct fetch_record_data *rd = talloc_get_type_abort( + data, struct fetch_record_data); + struct fetch_queue_state *fetch_queue = + (struct fetch_queue_state *)param; + struct ctdb_db_context *ctdb_db = fetch_queue->ctdb_db; + struct ctdb_client_call_state *state; + struct fetch_record_migrate_state *fetch; + struct ctdb_call call = { 0 }; + struct ctdb_ltdb_header header; + int ret; + + ret = tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, rd->key); + if (ret != 0) { + return 0; + } + + ret = tdb_parse_record(ctdb_db->ltdb->tdb, + rd->key, + fetch_record_parser, + &header); + + tdb_chainunlock(ctdb_db->ltdb->tdb, rd->key); + + if (ret != 0) { + goto skipped; + } + + if (header.dmaster == ctdb_db->ctdb->pnn) { + /* If the record is already migrated, skip */ + goto skipped; + } + + fetch = talloc_zero(ctdb_db, struct fetch_record_migrate_state); + if (fetch == NULL) { + D_ERR("Failed to setup fetch record migrate state\n"); + return 0; + } + + fetch->fetch_queue = fetch_queue; + + fetch->key.dsize = rd->key.dsize; + fetch->key.dptr = talloc_memdup(fetch, rd->key.dptr, rd->key.dsize); + if 
(fetch->key.dptr == NULL) { + D_ERR("Memory error in fetch_queue_traverse\n"); + talloc_free(fetch); + return 0; + } + + call.call_id = CTDB_NULL_FUNC; + call.flags = CTDB_IMMEDIATE_MIGRATION | + CTDB_CALL_FLAG_VACUUM_MIGRATION; + call.key = fetch->key; + + state = ctdb_call_send(ctdb_db, &call); + if (state == NULL) { + DEBUG(DEBUG_ERR, ("Failed to setup vacuum fetch call\n")); + talloc_free(fetch); + return 0; + } + + state->async.fn = fetch_record_migrate_callback; + state->async.private_data = fetch; + + fetch_queue->count++; + + return 0; + +skipped: + D_INFO("Skipped Fetch record, key=%.*s\n", + (int)rd->key.dsize, + rd->key.dptr); + return 0; +} + +/** + * Traverse the fetch. + * Records are migrated to the local node and + * added to delete queue for further processing. + */ +static void ctdb_process_fetch_queue(struct ctdb_db_context *ctdb_db) +{ + struct fetch_queue_state state; + int ret; + + state.ctdb_db = ctdb_db; + state.count = 0; + + ret = trbt_traversearray32(ctdb_db->fetch_queue, 1, + fetch_queue_traverse, &state); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Error traversing " + "the fetch queue.\n")); + } + + /* Wait for all migrations to complete */ + while (state.count > 0) { + tevent_loop_once(ctdb_db->ctdb->ev); + } +} + +/** + * traverse function for the traversal of the delete_queue, + * the fast-path vacuuming list. + * + * - If the record has been migrated off the node + * or has been revived (filled with data) on the node, + * then skip the record. + * + * - If the current node is the record's lmaster and it is + * a record that has never been migrated with data, then + * delete the record from the local tdb. + * + * - If the current node is the record's lmaster and it has + * been migrated with data, then schedule it for the normal + * vacuuming procedure (i.e. add it to the delete_list). 
+ * + * - If the current node is NOT the record's lmaster then + * add it to the list of records that are to be sent to + * the lmaster with the VACUUM_FETCH message. + */ +static int delete_queue_traverse(void *param, void *data) +{ + struct delete_record_data *dd = + talloc_get_type(data, struct delete_record_data); + struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data); + struct ctdb_db_context *ctdb_db = dd->ctdb_db; + struct ctdb_context *ctdb = ctdb_db->ctdb; /* or dd->ctdb ??? */ + int res; + struct ctdb_ltdb_header header; + uint32_t lmaster; + uint32_t hash = ctdb_hash(&(dd->key)); + + vdata->count.delete_queue.total++; + + res = tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, dd->key); + if (res != 0) { + vdata->count.delete_queue.error++; + return 0; + } + + res = tdb_parse_record(ctdb_db->ltdb->tdb, dd->key, + vacuum_record_parser, &header); + if (res != 0) { + goto skipped; + } + + if (header.dmaster != ctdb->pnn) { + /* The record has been migrated off the node. Skip. */ + goto skipped; + } + + if (header.rsn != dd->hdr.rsn) { + /* + * The record has been migrated off the node and back again. + * But not requeued for deletion. Skip it. + */ + goto skipped; + } + + /* + * We are dmaster, and the record has no data, and it has + * not been migrated after it has been queued for deletion. + * + * At this stage, the record could still have been revived locally + * and last been written with empty data. This can only be + * fixed with the addition of an active or delete flag. (TODO) + */ + + lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key); + + if (lmaster != ctdb->pnn) { + res = add_record_to_vacuum_fetch_list(vdata, dd->key); + + if (res != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Error adding record to list " + "of records to send to lmaster.\n")); + vdata->count.delete_queue.error++; + } else { + vdata->count.delete_queue.added_to_vacuum_fetch_list++; + } + goto done; + } + + /* use header->flags or dd->hdr.flags ?? 
*/ + if (dd->hdr.flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) { + res = add_record_to_delete_list(vdata, dd->key, &dd->hdr); + + if (res != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Error adding record to list " + "of records for deletion on lmaster.\n")); + vdata->count.delete_queue.error++; + } else { + vdata->count.delete_queue.added_to_delete_list++; + } + } else { + res = tdb_delete(ctdb_db->ltdb->tdb, dd->key); + + if (res != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Error deleting record with key " + "hash [0x%08x] from local data base db[%s].\n", + hash, ctdb_db->db_name)); + vdata->count.delete_queue.error++; + goto done; + } + + DEBUG(DEBUG_DEBUG, + (__location__ " Deleted record with key hash " + "[0x%08x] from local data base db[%s].\n", + hash, ctdb_db->db_name)); + vdata->count.delete_queue.deleted++; + } + + goto done; + +skipped: + vdata->count.delete_queue.skipped++; + +done: + tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key); + + return 0; +} + +/** + * Delete the records that we are lmaster and dmaster for and + * that could be deleted on all other nodes via the TRY_DELETE_RECORDS + * control. 
+ */ +static int delete_record_traverse(void *param, void *data) +{ + struct delete_record_data *dd = + talloc_get_type(data, struct delete_record_data); + struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data); + struct ctdb_db_context *ctdb_db = dd->ctdb_db; + struct ctdb_context *ctdb = ctdb_db->ctdb; + int res; + struct ctdb_ltdb_header header; + uint32_t lmaster; + uint32_t hash = ctdb_hash(&(dd->key)); + + if (dd->remote_fail_count > 0) { + vdata->count.delete_list.remote_error++; + vdata->count.delete_list.left--; + talloc_free(dd); + return 0; + } + + res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key); + if (res != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Error getting chainlock on record with " + "key hash [0x%08x] on database db[%s].\n", + hash, ctdb_db->db_name)); + vdata->count.delete_list.local_error++; + vdata->count.delete_list.left--; + talloc_free(dd); + return 0; + } + + /* + * Verify that the record is still empty, its RSN has not + * changed and that we are still its lmaster and dmaster. + */ + + res = tdb_parse_record(ctdb_db->ltdb->tdb, dd->key, + vacuum_record_parser, &header); + if (res != 0) { + goto skip; + } + + if (header.flags & CTDB_REC_RO_FLAGS) { + DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] " + "on database db[%s] has read-only flags. " + "skipping.\n", + hash, ctdb_db->db_name)); + goto skip; + } + + if (header.dmaster != ctdb->pnn) { + DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] " + "on database db[%s] has been migrated away. " + "skipping.\n", + hash, ctdb_db->db_name)); + goto skip; + } + + if (header.rsn != dd->hdr.rsn) { + /* + * The record has been migrated off the node and back again. + * But not requeued for deletion. Skip it. + */ + DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] " + "on database db[%s] seems to have been " + "migrated away and back again (with empty " + "data). 
skipping.\n", + hash, ctdb_db->db_name)); + goto skip; + } + + lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key); + + if (lmaster != ctdb->pnn) { + DEBUG(DEBUG_INFO, (__location__ ": not lmaster for record in " + "delete list (key hash [0x%08x], db[%s]). " + "Strange! skipping.\n", + hash, ctdb_db->db_name)); + goto skip; + } + + res = tdb_delete(ctdb_db->ltdb->tdb, dd->key); + + if (res != 0) { + DEBUG(DEBUG_ERR, + (__location__ " Error deleting record with key hash " + "[0x%08x] from local data base db[%s].\n", + hash, ctdb_db->db_name)); + vdata->count.delete_list.local_error++; + goto done; + } + + DEBUG(DEBUG_DEBUG, + (__location__ " Deleted record with key hash [0x%08x] from " + "local data base db[%s].\n", hash, ctdb_db->db_name)); + + vdata->count.delete_list.deleted++; + goto done; + +skip: + vdata->count.delete_list.skipped++; + +done: + tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key); + + talloc_free(dd); + vdata->count.delete_list.left--; + + return 0; +} + +/** + * Traverse the delete_queue. + * Records are either deleted directly or filled + * into the delete list or the vacuum fetch lists + * for further processing. 
+ */ +static void ctdb_process_delete_queue(struct ctdb_db_context *ctdb_db, + struct vacuum_data *vdata) +{ + uint32_t sum; + int ret; + + ret = trbt_traversearray32(ctdb_db->delete_queue, 1, + delete_queue_traverse, vdata); + + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Error traversing " + "the delete queue.\n")); + } + + sum = vdata->count.delete_queue.deleted + + vdata->count.delete_queue.skipped + + vdata->count.delete_queue.error + + vdata->count.delete_queue.added_to_delete_list + + vdata->count.delete_queue.added_to_vacuum_fetch_list; + + if (vdata->count.delete_queue.total != sum) { + DEBUG(DEBUG_ERR, (__location__ " Inconsistency in fast vacuum " + "counts for db[%s]: total[%u] != sum[%u]\n", + ctdb_db->db_name, + (unsigned)vdata->count.delete_queue.total, + (unsigned)sum)); + } + + if (vdata->count.delete_queue.total > 0) { + DEBUG(DEBUG_INFO, + (__location__ + " fast vacuuming delete_queue traverse statistics: " + "db[%s] " + "total[%u] " + "del[%u] " + "skp[%u] " + "err[%u] " + "adl[%u] " + "avf[%u]\n", + ctdb_db->db_name, + (unsigned)vdata->count.delete_queue.total, + (unsigned)vdata->count.delete_queue.deleted, + (unsigned)vdata->count.delete_queue.skipped, + (unsigned)vdata->count.delete_queue.error, + (unsigned)vdata->count.delete_queue.added_to_delete_list, + (unsigned)vdata->count.delete_queue.added_to_vacuum_fetch_list)); + } + + return; +} + +/** + * read-only traverse of the database, looking for records that + * might be able to be vacuumed. + * + * This is not done each time but only every tunable + * VacuumFastPathCount times. 
+ */ +static void ctdb_vacuum_traverse_db(struct ctdb_db_context *ctdb_db, + struct vacuum_data *vdata) +{ + int ret; + + ret = tdb_traverse_read(ctdb_db->ltdb->tdb, vacuum_traverse, vdata); + if (ret == -1 || vdata->traverse_error) { + DEBUG(DEBUG_ERR, (__location__ " Traverse error in vacuuming " + "'%s'\n", ctdb_db->db_name)); + return; + } + + if (vdata->count.db_traverse.total > 0) { + DEBUG(DEBUG_INFO, + (__location__ + " full vacuuming db traverse statistics: " + "db[%s] " + "total[%u] " + "skp[%u] " + "err[%u] " + "sched[%u]\n", + ctdb_db->db_name, + (unsigned)vdata->count.db_traverse.total, + (unsigned)vdata->count.db_traverse.skipped, + (unsigned)vdata->count.db_traverse.error, + (unsigned)vdata->count.db_traverse.scheduled)); + } + + return; +} + +/** + * Process the vacuum fetch lists: + * For records for which we are not the lmaster, tell the lmaster to + * fetch the record. + */ +static void ctdb_process_vacuum_fetch_lists(struct ctdb_db_context *ctdb_db, + struct vacuum_data *vdata) +{ + unsigned int i; + struct ctdb_context *ctdb = ctdb_db->ctdb; + int ret, res; + + for (i = 0; i < ctdb->num_nodes; i++) { + TDB_DATA data; + struct ctdb_marshall_buffer *vfl = vdata->vacuum_fetch_list[i]; + + if (ctdb->nodes[i]->pnn == ctdb->pnn) { + continue; + } + + if (vfl->count == 0) { + continue; + } + + DEBUG(DEBUG_INFO, ("Found %u records for lmaster %u in '%s'\n", + vfl->count, ctdb->nodes[i]->pnn, + ctdb_db->db_name)); + + data = ctdb_marshall_finish(vfl); + + ret = ctdb_control(ctdb, ctdb->nodes[i]->pnn, 0, + CTDB_CONTROL_VACUUM_FETCH, 0, + data, NULL, NULL, &res, NULL, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR, ("Failed to send vacuum " + "fetch control to node %u\n", + ctdb->nodes[i]->pnn)); + } + } +} + +/** + * Process the delete list: + * + * This is the last step of vacuuming that consistently deletes + * those records that have been migrated with data and can hence + * not be deleted when leaving a node. 
+ * + * In this step, the lmaster does the final deletion of those empty + * records that it is also dmaster for. It has usually received + * at least some of these records previously from the former dmasters + * with the vacuum fetch message. + * + * 1) Send the records to all active nodes with the TRY_DELETE_RECORDS + * control. The remote notes delete their local copy. + * 2) The lmaster locally deletes its copies of all records that + * could successfully be deleted remotely in step #2. + */ +static void ctdb_process_delete_list(struct ctdb_db_context *ctdb_db, + struct vacuum_data *vdata) +{ + int ret, i; + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct delete_records_list *recs; + TDB_DATA indata; + struct ctdb_node_map_old *nodemap; + uint32_t *active_nodes; + int num_active_nodes; + TALLOC_CTX *tmp_ctx; + uint32_t sum; + + if (vdata->count.delete_list.total == 0) { + return; + } + + tmp_ctx = talloc_new(vdata); + if (tmp_ctx == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + return; + } + + vdata->count.delete_list.left = vdata->count.delete_list.total; + + /* + * get the list of currently active nodes + */ + + ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), + CTDB_CURRENT_NODE, + tmp_ctx, + &nodemap); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n")); + goto done; + } + + active_nodes = list_of_active_nodes(ctdb, nodemap, + nodemap, /* talloc context */ + false /* include self */); + /* yuck! ;-) */ + num_active_nodes = talloc_get_size(active_nodes)/sizeof(*active_nodes); + + /* + * Now delete the records all active nodes in a two-phase process: + * 1) tell all active remote nodes to delete all their copy + * 2) if all remote nodes deleted their record copy, delete it locally + */ + + recs = talloc_zero(tmp_ctx, struct delete_records_list); + if (recs == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + goto done; + } + + /* + * Step 1: + * Send all records to all active nodes for deletion. 
+ */ + + /* + * Create a marshall blob from the remaining list of records to delete. + */ + + recs->records = (struct ctdb_marshall_buffer *) + talloc_zero_size(recs, + offsetof(struct ctdb_marshall_buffer, data)); + if (recs->records == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + goto done; + } + recs->records->db_id = ctdb_db->db_id; + + ret = trbt_traversearray32(vdata->delete_list, 1, + delete_marshall_traverse, recs); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Error traversing the " + "delete list for second marshalling.\n")); + goto done; + } + + indata = ctdb_marshall_finish(recs->records); + + for (i = 0; i < num_active_nodes; i++) { + struct ctdb_marshall_buffer *records; + struct ctdb_rec_data_old *rec; + int32_t res; + TDB_DATA outdata; + + ret = ctdb_control(ctdb, active_nodes[i], 0, + CTDB_CONTROL_TRY_DELETE_RECORDS, 0, + indata, recs, &outdata, &res, + NULL, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR, ("Failed to delete records on " + "node %u: ret[%d] res[%d]\n", + active_nodes[i], ret, res)); + goto done; + } + + /* + * outdata contains the list of records coming back + * from the node: These are the records that the + * remote node could not delete. We remove these from + * the list to delete locally. 
+ */ + records = (struct ctdb_marshall_buffer *)outdata.dptr; + rec = (struct ctdb_rec_data_old *)&records->data[0]; + while (records->count-- > 0) { + TDB_DATA reckey, recdata; + struct ctdb_ltdb_header *rechdr; + struct delete_record_data *dd; + + reckey.dptr = &rec->data[0]; + reckey.dsize = rec->keylen; + recdata.dptr = &rec->data[reckey.dsize]; + recdata.dsize = rec->datalen; + + if (recdata.dsize < sizeof(struct ctdb_ltdb_header)) { + DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n")); + goto done; + } + rechdr = (struct ctdb_ltdb_header *)recdata.dptr; + recdata.dptr += sizeof(*rechdr); + recdata.dsize -= sizeof(*rechdr); + + dd = (struct delete_record_data *)trbt_lookup32( + vdata->delete_list, + ctdb_hash(&reckey)); + if (dd != NULL) { + /* + * The remote node could not delete the + * record. Since other remote nodes can + * also fail, we just mark the record. + */ + dd->remote_fail_count++; + } else { + DEBUG(DEBUG_ERR, (__location__ " Failed to " + "find record with hash 0x%08x coming " + "back from TRY_DELETE_RECORDS " + "control in delete list.\n", + ctdb_hash(&reckey))); + } + + rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec); + } + } + + /* + * Step 2: + * Delete the remaining records locally. + * + * These records have successfully been deleted on all + * active remote nodes. 
+ */ + + ret = trbt_traversearray32(vdata->delete_list, 1, + delete_record_traverse, vdata); + if (ret != 0) { + DEBUG(DEBUG_ERR, (__location__ " Error traversing the " + "delete list for deletion.\n")); + } + + if (vdata->count.delete_list.left != 0) { + DEBUG(DEBUG_ERR, (__location__ " Vacuum db[%s] error: " + "there are %u records left for deletion after " + "processing delete list\n", + ctdb_db->db_name, + (unsigned)vdata->count.delete_list.left)); + } + + sum = vdata->count.delete_list.deleted + + vdata->count.delete_list.skipped + + vdata->count.delete_list.remote_error + + vdata->count.delete_list.local_error + + vdata->count.delete_list.left; + + if (vdata->count.delete_list.total != sum) { + DEBUG(DEBUG_ERR, (__location__ " Inconsistency in vacuum " + "delete list counts for db[%s]: total[%u] != sum[%u]\n", + ctdb_db->db_name, + (unsigned)vdata->count.delete_list.total, + (unsigned)sum)); + } + + if (vdata->count.delete_list.total > 0) { + DEBUG(DEBUG_INFO, + (__location__ + " vacuum delete list statistics: " + "db[%s] " + "total[%u] " + "del[%u] " + "skip[%u] " + "rem.err[%u] " + "loc.err[%u] " + "left[%u]\n", + ctdb_db->db_name, + (unsigned)vdata->count.delete_list.total, + (unsigned)vdata->count.delete_list.deleted, + (unsigned)vdata->count.delete_list.skipped, + (unsigned)vdata->count.delete_list.remote_error, + (unsigned)vdata->count.delete_list.local_error, + (unsigned)vdata->count.delete_list.left)); + } + +done: + talloc_free(tmp_ctx); + + return; +} + +/** + * initialize the vacuum_data + */ +static struct vacuum_data *ctdb_vacuum_init_vacuum_data( + struct ctdb_db_context *ctdb_db, + TALLOC_CTX *mem_ctx) +{ + unsigned int i; + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct vacuum_data *vdata; + + vdata = talloc_zero(mem_ctx, struct vacuum_data); + if (vdata == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + return NULL; + } + + vdata->ctdb = ctdb_db->ctdb; + vdata->ctdb_db = ctdb_db; + vdata->delete_list = trbt_create(vdata, 
0); + if (vdata->delete_list == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + goto fail; + } + + vdata->start = timeval_current(); + + vdata->count.delete_queue.added_to_delete_list = 0; + vdata->count.delete_queue.added_to_vacuum_fetch_list = 0; + vdata->count.delete_queue.deleted = 0; + vdata->count.delete_queue.skipped = 0; + vdata->count.delete_queue.error = 0; + vdata->count.delete_queue.total = 0; + vdata->count.db_traverse.scheduled = 0; + vdata->count.db_traverse.skipped = 0; + vdata->count.db_traverse.error = 0; + vdata->count.db_traverse.total = 0; + vdata->count.delete_list.total = 0; + vdata->count.delete_list.left = 0; + vdata->count.delete_list.remote_error = 0; + vdata->count.delete_list.local_error = 0; + vdata->count.delete_list.skipped = 0; + vdata->count.delete_list.deleted = 0; + + /* the list needs to be of length num_nodes */ + vdata->vacuum_fetch_list = talloc_zero_array(vdata, + struct ctdb_marshall_buffer *, + ctdb->num_nodes); + if (vdata->vacuum_fetch_list == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + goto fail; + } + for (i = 0; i < ctdb->num_nodes; i++) { + vdata->vacuum_fetch_list[i] = (struct ctdb_marshall_buffer *) + talloc_zero_size(vdata->vacuum_fetch_list, + offsetof(struct ctdb_marshall_buffer, data)); + if (vdata->vacuum_fetch_list[i] == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Out of memory\n")); + talloc_free(vdata); + return NULL; + } + vdata->vacuum_fetch_list[i]->db_id = ctdb_db->db_id; + } + + return vdata; + +fail: + talloc_free(vdata); + return NULL; +} + +/** + * Vacuum a DB: + * - Always do the fast vacuuming run, which traverses + * - the in-memory fetch queue: these records have been + * scheduled for migration + * - the in-memory delete queue: these records have been + * scheduled for deletion. + * - Only if explicitly requested, the database is traversed + * in order to use the traditional heuristics on empty records + * to trigger deletion. 
+ * This is done only every VacuumFastPathCount'th vacuuming run. + * + * The traverse runs fill two lists: + * + * - The delete_list: + * This is the list of empty records the current + * node is lmaster and dmaster for. These records are later + * deleted first on other nodes and then locally. + * + * The fast vacuuming run has a short cut for those records + * that have never been migrated with data: these records + * are immediately deleted locally, since they have left + * no trace on other nodes. + * + * - The vacuum_fetch lists + * (one for each other lmaster node): + * The records in this list are sent for deletion to + * their lmaster in a bulk VACUUM_FETCH control. + * + * The lmaster then migrates all these records to itelf + * so that they can be vacuumed there. + * + * This executes in the child context. + */ +static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db, + bool full_vacuum_run) +{ + struct ctdb_context *ctdb = ctdb_db->ctdb; + int ret, pnn; + struct vacuum_data *vdata; + TALLOC_CTX *tmp_ctx; + + DEBUG(DEBUG_INFO, (__location__ " Entering %s vacuum run for db " + "%s db_id[0x%08x]\n", + full_vacuum_run ? 
"full" : "fast", + ctdb_db->db_name, ctdb_db->db_id)); + + ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Unable to get vnnmap from local node\n")); + return ret; + } + + pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE); + if (pnn == -1) { + DEBUG(DEBUG_ERR, ("Unable to get pnn from local node\n")); + return -1; + } + + ctdb->pnn = pnn; + + tmp_ctx = talloc_new(ctdb_db); + if (tmp_ctx == NULL) { + DEBUG(DEBUG_ERR, ("Out of memory!\n")); + return -1; + } + + vdata = ctdb_vacuum_init_vacuum_data(ctdb_db, tmp_ctx); + if (vdata == NULL) { + talloc_free(tmp_ctx); + return -1; + } + + if (full_vacuum_run) { + ctdb_vacuum_traverse_db(ctdb_db, vdata); + } + + ctdb_process_fetch_queue(ctdb_db); + + ctdb_process_delete_queue(ctdb_db, vdata); + + ctdb_process_vacuum_fetch_lists(ctdb_db, vdata); + + ctdb_process_delete_list(ctdb_db, vdata); + + talloc_free(tmp_ctx); + + return 0; +} + +/* + * repack and vacuum a db + * called from the child context + */ +static int ctdb_vacuum_and_repack_db(struct ctdb_db_context *ctdb_db, + bool full_vacuum_run) +{ + uint32_t repack_limit = ctdb_db->ctdb->tunable.repack_limit; + const char *name = ctdb_db->db_name; + int freelist_size = 0; + int ret; + + if (ctdb_vacuum_db(ctdb_db, full_vacuum_run) != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to vacuum '%s'\n", name)); + } + + freelist_size = tdb_freelist_size(ctdb_db->ltdb->tdb); + if (freelist_size == -1) { + DEBUG(DEBUG_ERR,(__location__ " Failed to get freelist size for '%s'\n", name)); + return -1; + } + + /* + * decide if a repack is necessary + */ + if ((repack_limit == 0 || (uint32_t)freelist_size < repack_limit)) + { + return 0; + } + + D_NOTICE("Repacking %s with %u freelist entries\n", + name, + freelist_size); + + ret = tdb_repack(ctdb_db->ltdb->tdb); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to repack '%s'\n", name)); + return -1; + } + + return 0; +} + +static uint32_t 
get_vacuum_interval(struct ctdb_db_context *ctdb_db) +{ + uint32_t interval = ctdb_db->ctdb->tunable.vacuum_interval; + + return interval; +} + +static int vacuum_child_destructor(struct ctdb_vacuum_child_context *child_ctx) +{ + double l = timeval_elapsed(&child_ctx->start_time); + struct ctdb_vacuum_handle *vacuum_handle = child_ctx->vacuum_handle; + struct ctdb_db_context *ctdb_db = vacuum_handle->ctdb_db; + struct ctdb_context *ctdb = ctdb_db->ctdb; + + CTDB_UPDATE_DB_LATENCY(ctdb_db, "vacuum", vacuum.latency, l); + DEBUG(DEBUG_INFO,("Vacuuming took %.3f seconds for database %s\n", l, ctdb_db->db_name)); + + if (child_ctx->child_pid != -1) { + ctdb_kill(ctdb, child_ctx->child_pid, SIGKILL); + } else { + /* Bump the number of successful fast-path runs. */ + vacuum_handle->fast_path_count++; + } + + ctdb->vacuumer = NULL; + + if (child_ctx->scheduled) { + vacuum_handle->vacuum_interval = get_vacuum_interval(ctdb_db); + + tevent_add_timer( + ctdb->ev, + vacuum_handle, + timeval_current_ofs(vacuum_handle->vacuum_interval, 0), + ctdb_vacuum_event, + vacuum_handle); + } + + return 0; +} + +/* + * this event is generated when a vacuum child process times out + */ +static void vacuum_child_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_vacuum_child_context *child_ctx = talloc_get_type(private_data, struct ctdb_vacuum_child_context); + + DEBUG(DEBUG_ERR,("Vacuuming child process timed out for db %s\n", child_ctx->vacuum_handle->ctdb_db->db_name)); + + child_ctx->status = VACUUM_TIMEOUT; + + talloc_free(child_ctx); +} + + +/* + * this event is generated when a vacuum child process has completed + */ +static void vacuum_child_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, void *private_data) +{ + struct ctdb_vacuum_child_context *child_ctx = talloc_get_type(private_data, struct ctdb_vacuum_child_context); + char c = 0; + int ret; + + DEBUG(DEBUG_INFO,("Vacuuming child 
process %d finished for db %s\n", child_ctx->child_pid, child_ctx->vacuum_handle->ctdb_db->db_name)); + child_ctx->child_pid = -1; + + ret = sys_read(child_ctx->fd[0], &c, 1); + if (ret != 1 || c != 0) { + child_ctx->status = VACUUM_ERROR; + DEBUG(DEBUG_ERR, ("A vacuum child process failed with an error for database %s. ret=%d c=%d\n", child_ctx->vacuum_handle->ctdb_db->db_name, ret, c)); + } else { + child_ctx->status = VACUUM_OK; + } + + talloc_free(child_ctx); +} + +/* + * this event is called every time we need to start a new vacuum process + */ +static int vacuum_db_child(TALLOC_CTX *mem_ctx, + struct ctdb_db_context *ctdb_db, + bool scheduled, + bool full_vacuum_run, + struct ctdb_vacuum_child_context **out) +{ + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct ctdb_vacuum_child_context *child_ctx; + struct tevent_fd *fde; + int ret; + + /* we don't vacuum if we are in recovery mode, or db frozen */ + if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE || + ctdb_db_frozen(ctdb_db)) { + D_INFO("Not vacuuming %s (%s)\n", ctdb_db->db_name, + ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ? + "in recovery" : "frozen"); + return EAGAIN; + } + + /* Do not allow multiple vacuuming child processes to be active at the + * same time. If there is vacuuming child process active, delay + * new vacuuming event to stagger vacuuming events. 
+ */ + if (ctdb->vacuumer != NULL) { + return EBUSY; + } + + child_ctx = talloc_zero(mem_ctx, struct ctdb_vacuum_child_context); + if (child_ctx == NULL) { + DBG_ERR("Failed to allocate child context for vacuuming of %s\n", + ctdb_db->db_name); + return ENOMEM; + } + + + ret = pipe(child_ctx->fd); + if (ret != 0) { + talloc_free(child_ctx); + D_ERR("Failed to create pipe for vacuum child process.\n"); + return EAGAIN; + } + + child_ctx->child_pid = ctdb_fork(ctdb); + if (child_ctx->child_pid == (pid_t)-1) { + close(child_ctx->fd[0]); + close(child_ctx->fd[1]); + talloc_free(child_ctx); + D_ERR("Failed to fork vacuum child process.\n"); + return EAGAIN; + } + + + if (child_ctx->child_pid == 0) { + char cc = 0; + close(child_ctx->fd[0]); + + D_INFO("Vacuuming child process %d for db %s started\n", + getpid(), + ctdb_db->db_name); + prctl_set_comment("ctdb_vacuum"); + ret = switch_from_server_to_client(ctdb); + if (ret != 0) { + DBG_ERR("ERROR: failed to switch vacuum daemon " + "into client mode.\n"); + return EIO; + } + + cc = ctdb_vacuum_and_repack_db(ctdb_db, full_vacuum_run); + + sys_write(child_ctx->fd[1], &cc, 1); + _exit(0); + } + + set_close_on_exec(child_ctx->fd[0]); + close(child_ctx->fd[1]); + + child_ctx->status = VACUUM_RUNNING; + child_ctx->scheduled = scheduled; + child_ctx->start_time = timeval_current(); + + ctdb->vacuumer = child_ctx; + talloc_set_destructor(child_ctx, vacuum_child_destructor); + + /* + * Clear the fastpath vacuuming list in the parent. + */ + talloc_free(ctdb_db->delete_queue); + ctdb_db->delete_queue = trbt_create(ctdb_db, 0); + if (ctdb_db->delete_queue == NULL) { + DBG_ERR("Out of memory when re-creating vacuum tree\n"); + return ENOMEM; + } + + talloc_free(ctdb_db->fetch_queue); + ctdb_db->fetch_queue = trbt_create(ctdb_db, 0); + if (ctdb_db->fetch_queue == NULL) { + ctdb_fatal(ctdb, "Out of memory when re-create fetch queue " + " in parent context. 
Shutting down\n"); + } + + tevent_add_timer(ctdb->ev, child_ctx, + timeval_current_ofs(ctdb->tunable.vacuum_max_run_time, + 0), + vacuum_child_timeout, child_ctx); + + DBG_DEBUG(" Created PIPE FD:%d to child vacuum process\n", + child_ctx->fd[0]); + + fde = tevent_add_fd(ctdb->ev, child_ctx, child_ctx->fd[0], + TEVENT_FD_READ, vacuum_child_handler, child_ctx); + tevent_fd_set_auto_close(fde); + + child_ctx->vacuum_handle = ctdb_db->vacuum_handle; + + *out = child_ctx; + return 0; +} + +static void ctdb_vacuum_event(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *private_data) +{ + struct ctdb_vacuum_handle *vacuum_handle = talloc_get_type( + private_data, struct ctdb_vacuum_handle); + struct ctdb_db_context *ctdb_db = vacuum_handle->ctdb_db; + struct ctdb_context *ctdb = ctdb_db->ctdb; + struct ctdb_vacuum_child_context *child_ctx = NULL; + uint32_t fast_path_max = ctdb->tunable.vacuum_fast_path_count; + uint32_t vacuum_interval = get_vacuum_interval(ctdb_db); + bool full_vacuum_run = false; + int ret; + + if (vacuum_interval > vacuum_handle->vacuum_interval) { + uint32_t d = vacuum_interval - vacuum_handle->vacuum_interval; + + DBG_INFO("Vacuum interval increased from " + "%"PRIu32" to %"PRIu32", rescheduling\n", + vacuum_handle->vacuum_interval, + vacuum_interval); + vacuum_handle->vacuum_interval = vacuum_interval; + tevent_add_timer(ctdb->ev, + vacuum_handle, + timeval_current_ofs(d, 0), + ctdb_vacuum_event, + vacuum_handle); + return; + } + + vacuum_handle->vacuum_interval = vacuum_interval; + + if (vacuum_handle->fast_path_count >= fast_path_max) { + if (fast_path_max > 0) { + full_vacuum_run = true; + } + vacuum_handle->fast_path_count = 0; + } + + ret = vacuum_db_child(vacuum_handle, + ctdb_db, + true, + full_vacuum_run, + &child_ctx); + + if (ret == 0) { + return; + } + + switch (ret) { + case EBUSY: + /* Stagger */ + tevent_add_timer(ctdb->ev, + vacuum_handle, + timeval_current_ofs(0, 500*1000), + ctdb_vacuum_event, + 
vacuum_handle); + break; + + default: + /* Temporary failure, schedule next attempt */ + tevent_add_timer(ctdb->ev, + vacuum_handle, + timeval_current_ofs( + vacuum_handle->vacuum_interval, 0), + ctdb_vacuum_event, + vacuum_handle); + } + +} + +struct vacuum_control_state { + struct ctdb_vacuum_child_context *child_ctx; + struct ctdb_req_control_old *c; + struct ctdb_context *ctdb; +}; + +static int vacuum_control_state_destructor(struct vacuum_control_state *state) +{ + struct ctdb_vacuum_child_context *child_ctx = state->child_ctx; + int32_t status; + + status = (child_ctx->status == VACUUM_OK ? 0 : -1); + ctdb_request_control_reply(state->ctdb, state->c, NULL, status, NULL); + + return 0; +} + +int32_t ctdb_control_db_vacuum(struct ctdb_context *ctdb, + struct ctdb_req_control_old *c, + TDB_DATA indata, + bool *async_reply) +{ + struct ctdb_db_context *ctdb_db; + struct ctdb_vacuum_child_context *child_ctx = NULL; + struct ctdb_db_vacuum *db_vacuum; + struct vacuum_control_state *state; + size_t np; + int ret; + + ret = ctdb_db_vacuum_pull(indata.dptr, + indata.dsize, + ctdb, + &db_vacuum, + &np); + if (ret != 0) { + DBG_ERR("Invalid data\n"); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, db_vacuum->db_id); + if (ctdb_db == NULL) { + DBG_ERR("Unknown db id 0x%08x\n", db_vacuum->db_id); + talloc_free(db_vacuum); + return -1; + } + + state = talloc(ctdb, struct vacuum_control_state); + if (state == NULL) { + DBG_ERR("Memory allocation error\n"); + return -1; + } + + ret = vacuum_db_child(ctdb_db, + ctdb_db, + false, + db_vacuum->full_vacuum_run, + &child_ctx); + + talloc_free(db_vacuum); + + if (ret == 0) { + (void) talloc_steal(child_ctx, state); + + state->child_ctx = child_ctx; + state->c = talloc_steal(state, c); + state->ctdb = ctdb; + + talloc_set_destructor(state, vacuum_control_state_destructor); + + *async_reply = true; + return 0; + } + + talloc_free(state); + + switch (ret) { + case EBUSY: + DBG_WARNING("Vacuuming collision\n"); + break; + + 
default: + DBG_ERR("Temporary vacuuming failure, ret=%d\n", ret); + } + + return -1; +} + +void ctdb_stop_vacuuming(struct ctdb_context *ctdb) +{ + if (ctdb->vacuumer != NULL) { + D_INFO("Aborting vacuuming for %s (%i)\n", + ctdb->vacuumer->vacuum_handle->ctdb_db->db_name, + (int)ctdb->vacuumer->child_pid); + /* vacuum_child_destructor kills it, removes from list */ + talloc_free(ctdb->vacuumer); + } +} + +/* this function initializes the vacuuming context for a database + * starts the vacuuming events + */ +int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db) +{ + struct ctdb_vacuum_handle *vacuum_handle; + + if (! ctdb_db_volatile(ctdb_db)) { + DEBUG(DEBUG_ERR, + ("Vacuuming is disabled for non-volatile database %s\n", + ctdb_db->db_name)); + return 0; + } + + vacuum_handle = talloc(ctdb_db, struct ctdb_vacuum_handle); + if (vacuum_handle == NULL) { + DBG_ERR("Memory allocation error\n"); + return -1; + } + + vacuum_handle->ctdb_db = ctdb_db; + vacuum_handle->fast_path_count = 0; + vacuum_handle->vacuum_interval = get_vacuum_interval(ctdb_db); + + ctdb_db->vacuum_handle = vacuum_handle; + + tevent_add_timer(ctdb_db->ctdb->ev, + vacuum_handle, + timeval_current_ofs(vacuum_handle->vacuum_interval, 0), + ctdb_vacuum_event, + vacuum_handle); + + return 0; +} + +static void remove_record_from_delete_queue(struct ctdb_db_context *ctdb_db, + const struct ctdb_ltdb_header *hdr, + const TDB_DATA key) +{ + struct delete_record_data *kd; + uint32_t hash; + + hash = (uint32_t)ctdb_hash(&key); + + DEBUG(DEBUG_DEBUG, (__location__ + " remove_record_from_delete_queue: " + "db[%s] " + "db_id[0x%08x] " + "key_hash[0x%08x] " + "lmaster[%u] " + "migrated_with_data[%s]\n", + ctdb_db->db_name, ctdb_db->db_id, + hash, + ctdb_lmaster(ctdb_db->ctdb, &key), + hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? 
"yes" : "no")); + + kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash); + if (kd == NULL) { + DEBUG(DEBUG_DEBUG, (__location__ + " remove_record_from_delete_queue: " + "record not in queue (hash[0x%08x])\n.", + hash)); + return; + } + + if ((kd->key.dsize != key.dsize) || + (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0)) + { + DEBUG(DEBUG_DEBUG, (__location__ + " remove_record_from_delete_queue: " + "hash collision for key with hash[0x%08x] " + "in db[%s] - skipping\n", + hash, ctdb_db->db_name)); + return; + } + + DEBUG(DEBUG_DEBUG, (__location__ + " remove_record_from_delete_queue: " + "removing key with hash[0x%08x]\n", + hash)); + + talloc_free(kd); + + return; +} + +/** + * Insert a record into the ctdb_db context's delete queue, + * handling hash collisions. + */ +static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db, + const struct ctdb_ltdb_header *hdr, + TDB_DATA key) +{ + struct delete_record_data *kd; + uint32_t hash; + int ret; + + hash = (uint32_t)ctdb_hash(&key); + + DEBUG(DEBUG_DEBUG, (__location__ " schedule for deletion: db[%s] " + "db_id[0x%08x] " + "key_hash[0x%08x] " + "lmaster[%u] " + "migrated_with_data[%s]\n", + ctdb_db->db_name, ctdb_db->db_id, + hash, + ctdb_lmaster(ctdb_db->ctdb, &key), + hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? "yes" : "no")); + + kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash); + if (kd != NULL) { + if ((kd->key.dsize != key.dsize) || + (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0)) + { + DEBUG(DEBUG_INFO, + (__location__ " schedule for deletion: " + "hash collision for key hash [0x%08x]. 
" + "Skipping the record.\n", hash)); + return 0; + } else { + DEBUG(DEBUG_DEBUG, + (__location__ " schedule for deletion: " + "updating entry for key with hash [0x%08x].\n", + hash)); + } + } + + ret = insert_delete_record_data_into_tree(ctdb_db->ctdb, ctdb_db, + ctdb_db->delete_queue, + hdr, key); + if (ret != 0) { + DEBUG(DEBUG_INFO, + (__location__ " schedule for deletion: error " + "inserting key with hash [0x%08x] into delete queue\n", + hash)); + return -1; + } + + return 0; +} + +/** + * Schedule a record for deletion. + * Called from the parent context. + */ +int32_t ctdb_control_schedule_for_deletion(struct ctdb_context *ctdb, + TDB_DATA indata) +{ + struct ctdb_control_schedule_for_deletion *dd; + struct ctdb_db_context *ctdb_db; + int ret; + TDB_DATA key; + + dd = (struct ctdb_control_schedule_for_deletion *)indata.dptr; + + ctdb_db = find_ctdb_db(ctdb, dd->db_id); + if (ctdb_db == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Unknown db id 0x%08x\n", + dd->db_id)); + return -1; + } + + key.dsize = dd->keylen; + key.dptr = dd->key; + + ret = insert_record_into_delete_queue(ctdb_db, &dd->hdr, key); + + return ret; +} + +int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db, + const struct ctdb_ltdb_header *hdr, + TDB_DATA key) +{ + int ret; + struct ctdb_control_schedule_for_deletion *dd; + TDB_DATA indata; + int32_t status; + + if (ctdb_db->ctdb->ctdbd_pid == getpid()) { + /* main daemon - directly queue */ + ret = insert_record_into_delete_queue(ctdb_db, hdr, key); + + return ret; + } + + /* if we don't have a connection to the daemon we can not send + a control. For example sometimes from update_record control child + process. 
+ */ + if (!ctdb_db->ctdb->can_send_controls) { + return -1; + } + + + /* child process: send the main daemon a control */ + indata.dsize = offsetof(struct ctdb_control_schedule_for_deletion, key) + key.dsize; + indata.dptr = talloc_zero_array(ctdb_db, uint8_t, indata.dsize); + if (indata.dptr == NULL) { + DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); + return -1; + } + dd = (struct ctdb_control_schedule_for_deletion *)(void *)indata.dptr; + dd->db_id = ctdb_db->db_id; + dd->hdr = *hdr; + dd->keylen = key.dsize; + memcpy(dd->key, key.dptr, key.dsize); + + ret = ctdb_control(ctdb_db->ctdb, + CTDB_CURRENT_NODE, + ctdb_db->db_id, + CTDB_CONTROL_SCHEDULE_FOR_DELETION, + CTDB_CTRL_FLAG_NOREPLY, /* flags */ + indata, + NULL, /* mem_ctx */ + NULL, /* outdata */ + &status, + NULL, /* timeout : NULL == wait forever */ + NULL); /* error message */ + + talloc_free(indata.dptr); + + if (ret != 0 || status != 0) { + DEBUG(DEBUG_ERR, (__location__ " Error sending " + "SCHEDULE_FOR_DELETION " + "control.\n")); + if (status != 0) { + ret = -1; + } + } + + return ret; +} + +void ctdb_local_remove_from_delete_queue(struct ctdb_db_context *ctdb_db, + const struct ctdb_ltdb_header *hdr, + const TDB_DATA key) +{ + if (ctdb_db->ctdb->ctdbd_pid != getpid()) { + /* + * Only remove the record from the delete queue if called + * in the main daemon. 
+ */ + return; + } + + remove_record_from_delete_queue(ctdb_db, hdr, key); + + return; +} + +static int vacuum_fetch_parser(uint32_t reqid, + struct ctdb_ltdb_header *header, + TDB_DATA key, TDB_DATA data, + void *private_data) +{ + struct ctdb_db_context *ctdb_db = talloc_get_type_abort( + private_data, struct ctdb_db_context); + struct fetch_record_data *rd; + size_t len; + uint32_t hash; + + len = offsetof(struct fetch_record_data, keydata) + key.dsize; + + rd = (struct fetch_record_data *)talloc_size(ctdb_db->fetch_queue, + len); + if (rd == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Memory error\n")); + return -1; + } + talloc_set_name_const(rd, "struct fetch_record_data"); + + rd->key.dsize = key.dsize; + rd->key.dptr = rd->keydata; + memcpy(rd->keydata, key.dptr, key.dsize); + + hash = ctdb_hash(&key); + + trbt_insert32(ctdb_db->fetch_queue, hash, rd); + + return 0; +} + +int32_t ctdb_control_vacuum_fetch(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_rec_buffer *recbuf; + struct ctdb_db_context *ctdb_db; + size_t npull; + int ret; + + ret = ctdb_rec_buffer_pull(indata.dptr, indata.dsize, ctdb, &recbuf, + &npull); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Invalid data in vacuum_fetch\n")); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, recbuf->db_id); + if (ctdb_db == NULL) { + talloc_free(recbuf); + DEBUG(DEBUG_ERR, (__location__ " Unknown db 0x%08x\n", + recbuf->db_id)); + return -1; + } + + ret = ctdb_rec_buffer_traverse(recbuf, vacuum_fetch_parser, ctdb_db); + talloc_free(recbuf); + return ret; +} diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c new file mode 100644 index 0000000..a388bff --- /dev/null +++ b/ctdb/server/ctdbd.c @@ -0,0 +1,407 @@ +/* + standalone ctdb daemon + + Copyright (C) Andrew Tridgell 2006 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your 
option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/time.h" +#include "system/wait.h" +#include "system/network.h" +#include "system/syslog.h" + +#include <popt.h> +#include <talloc.h> +/* Allow use of deprecated function tevent_loop_allow_nesting() */ +#define TEVENT_DEPRECATED +#include <tevent.h> + +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" + +#include "ctdb_private.h" + +#include "common/reqid.h" +#include "common/system.h" +#include "common/common.h" +#include "common/path.h" +#include "common/logging.h" +#include "common/logging_conf.h" + +#include "ctdb_config.h" + +int script_log_level; +bool fast_start; + +/* + called by the transport layer when a packet comes in +*/ +static void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t length) +{ + struct ctdb_req_header *hdr = (struct ctdb_req_header *)data; + + CTDB_INCREMENT_STAT(ctdb, node_packets_recv); + + /* up the counter for this source node, so we know its alive */ + if (ctdb_validate_pnn(ctdb, hdr->srcnode)) { + /* as a special case, redirected calls don't increment the rx_cnt */ + if (hdr->operation != CTDB_REQ_CALL || + ((struct ctdb_req_call_old *)hdr)->hopcount == 0) { + ctdb->nodes[hdr->srcnode]->rx_cnt++; + } + } + + ctdb_input_pkt(ctdb, hdr); +} + +static const struct ctdb_upcalls ctdb_upcalls = { + .recv_pkt = ctdb_recv_pkt, + .node_dead = ctdb_node_dead, + .node_connected = ctdb_node_connected +}; + +static struct ctdb_context *ctdb_init(struct tevent_context *ev) +{ + int ret; + struct ctdb_context *ctdb; + + ctdb = talloc_zero(ev, 
struct ctdb_context); + if (ctdb == NULL) { + DBG_ERR("Memory error\n"); + return NULL; + } + ctdb->ev = ev; + + /* Wrap early to exercise code. */ + ret = reqid_init(ctdb, INT_MAX-200, &ctdb->idr); + if (ret != 0) { + D_ERR("reqid_init failed (%s)\n", strerror(ret)); + talloc_free(ctdb); + return NULL; + } + + ret = srvid_init(ctdb, &ctdb->srv); + if (ret != 0) { + D_ERR("srvid_init failed (%s)\n", strerror(ret)); + talloc_free(ctdb); + return NULL; + } + + ctdb->daemon.name = path_socket(ctdb, "ctdbd"); + if (ctdb->daemon.name == NULL) { + DBG_ERR("Memory allocation error\n"); + talloc_free(ctdb); + return NULL; + } + + ctdbd_pidfile = path_pidfile(ctdb, "ctdbd"); + if (ctdbd_pidfile == NULL) { + DBG_ERR("Memory allocation error\n"); + talloc_free(ctdb); + return NULL; + } + + gettimeofday(&ctdb->ctdbd_start_time, NULL); + + gettimeofday(&ctdb->last_recovery_started, NULL); + gettimeofday(&ctdb->last_recovery_finished, NULL); + + ctdb->recovery_mode = CTDB_RECOVERY_NORMAL; + + ctdb->upcalls = &ctdb_upcalls; + + ctdb->statistics.statistics_start_time = timeval_current(); + + ctdb->capabilities = CTDB_CAP_DEFAULT; + + /* + * Initialise this node's PNN to the unknown value. This will + * be set to the correct value by either ctdb_add_node() as + * part of loading the nodes file or by + * ctdb_tcp_listen_automatic() when the transport is + * initialised. At some point we should de-optimise this and + * pull it out into ctdb_start_daemon() so it is done clearly + * and only in one place. 
+ */ + ctdb->pnn = CTDB_UNKNOWN_PNN; + + ctdb->do_checkpublicip = true; + + return ctdb; +} + + +/* + main program +*/ +int main(int argc, const char *argv[]) +{ + struct ctdb_context *ctdb = NULL; + int interactive_opt = 0; + bool interactive = false; + + struct poptOption popt_options[] = { + POPT_AUTOHELP + { "interactive", 'i', POPT_ARG_NONE, &interactive_opt, 0, + "don't fork, log to stderr", NULL }, + POPT_TABLEEND + }; + int opt, ret; + const char **extra_argv; + poptContext pc; + struct tevent_context *ev; + const char *ctdb_base; + struct conf_context *conf; + const char *logging_location; + const char *test_mode; + bool ok; + + setproctitle_init(argc, discard_const(argv), environ); + + /* + * Basic setup + */ + + talloc_enable_null_tracking(); + + fault_setup(); + + ev = tevent_context_init(NULL); + if (ev == NULL) { + fprintf(stderr, "tevent_context_init() failed\n"); + exit(1); + } + tevent_loop_allow_nesting(ev); + + ctdb = ctdb_init(ev); + if (ctdb == NULL) { + fprintf(stderr, "Failed to init ctdb\n"); + exit(1); + } + + /* Default value for CTDB_BASE - don't override */ + setenv("CTDB_BASE", CTDB_ETCDIR, 0); + ctdb_base = getenv("CTDB_BASE"); + if (ctdb_base == NULL) { + D_ERR("CTDB_BASE not set\n"); + exit(1); + } + + /* + * Command-line option handling + */ + + pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST); + + while ((opt = poptGetNextOpt(pc)) != -1) { + switch (opt) { + default: + fprintf(stderr, "Invalid option %s: %s\n", + poptBadOption(pc, 0), poptStrerror(opt)); + goto fail; + } + } + + /* If there are extra arguments then exit with usage message */ + extra_argv = poptGetArgs(pc); + if (extra_argv) { + extra_argv++; + if (extra_argv[0]) { + poptPrintHelp(pc, stdout, 0); + goto fail; + } + } + + interactive = (interactive_opt != 0); + + /* + * Configuration file handling + */ + + ret = ctdbd_config_load(ctdb, &conf); + if (ret != 0) { + /* ctdbd_config_load() logs the failure */ + goto fail; + } + + /* + * 
Logging setup/options + */ + + test_mode = getenv("CTDB_TEST_MODE"); + + /* Log to stderr (ignoring configuration) when running as interactive */ + if (interactive) { + logging_location = "file:"; + setenv("CTDB_INTERACTIVE", "true", 1); + } else { + logging_location = logging_conf_location(conf); + } + + if (strcmp(logging_location, "syslog") != 0 && test_mode == NULL) { + /* This can help when CTDB logging is misconfigured */ + syslog(LOG_DAEMON|LOG_NOTICE, + "CTDB logging to location %s", + logging_location); + } + + /* Initialize logging and set the debug level */ + ok = ctdb_logging_init(ctdb, + logging_location, + logging_conf_log_level(conf)); + if (!ok) { + goto fail; + } + setenv("CTDB_LOGGING", logging_location, 1); + setenv("CTDB_DEBUGLEVEL", debug_level_to_string(DEBUGLEVEL), 1); + + script_log_level = debug_level_from_string( + ctdb_config.script_log_level); + + D_NOTICE("CTDB starting on node\n"); + + /* + * Cluster setup/options + */ + + ret = ctdb_set_transport(ctdb, ctdb_config.transport); + if (ret == -1) { + D_ERR("ctdb_set_transport failed - %s\n", ctdb_errstr(ctdb)); + goto fail; + } + + if (ctdb_config.cluster_lock != NULL) { + ctdb->recovery_lock = ctdb_config.cluster_lock; + } else if (ctdb_config.recovery_lock != NULL) { + ctdb->recovery_lock = ctdb_config.recovery_lock; + } else { + D_WARNING("Cluster lock not set\n"); + } + + /* tell ctdb what address to listen on */ + if (ctdb_config.node_address) { + ret = ctdb_set_address(ctdb, ctdb_config.node_address); + if (ret == -1) { + D_ERR("ctdb_set_address failed - %s\n", + ctdb_errstr(ctdb)); + goto fail; + } + } + + /* tell ctdb what nodes are available */ + ctdb->nodes_file = talloc_asprintf(ctdb, "%s/nodes", ctdb_base); + if (ctdb->nodes_file == NULL) { + DBG_ERR(" Out of memory\n"); + goto fail; + } + ctdb_load_nodes_file(ctdb); + + /* + * Database setup/options + */ + + ctdb->db_directory = ctdb_config.dbdir_volatile; + ok = directory_exist(ctdb->db_directory); + if (! 
ok) { + D_ERR("Volatile database directory %s does not exist\n", + ctdb->db_directory); + goto fail; + } + + ctdb->db_directory_persistent = ctdb_config.dbdir_persistent; + ok = directory_exist(ctdb->db_directory_persistent); + if (! ok) { + D_ERR("Persistent database directory %s does not exist\n", + ctdb->db_directory_persistent); + goto fail; + } + + ctdb->db_directory_state = ctdb_config.dbdir_state; + ok = directory_exist(ctdb->db_directory_state); + if (! ok) { + D_ERR("State database directory %s does not exist\n", + ctdb->db_directory_state); + goto fail; + } + + if (ctdb_config.lock_debug_script != NULL) { + ret = setenv("CTDB_DEBUG_LOCKS", + ctdb_config.lock_debug_script, + 1); + if (ret != 0) { + D_ERR("Failed to set up lock debugging (%s)\n", + strerror(errno)); + goto fail; + } + } + + /* + * Legacy setup/options + */ + + ctdb->start_as_disabled = (int)ctdb_config.start_as_disabled; + ctdb->start_as_stopped = (int)ctdb_config.start_as_stopped; + + /* set ctdbd capabilities */ + if (!ctdb_config.lmaster_capability) { + ctdb->capabilities &= ~CTDB_CAP_LMASTER; + } + if (!ctdb_config.leader_capability) { + ctdb->capabilities &= ~CTDB_CAP_RECMASTER; + } + + ctdb->do_setsched = ctdb_config.realtime_scheduling; + + /* + * Miscellaneous setup + */ + + ctdb_tunables_load(ctdb); + + ctdb->event_script_dir = talloc_asprintf(ctdb, + "%s/events/legacy", + ctdb_base); + if (ctdb->event_script_dir == NULL) { + DBG_ERR("Out of memory\n"); + goto fail; + } + + ctdb->notification_script = talloc_asprintf(ctdb, + "%s/notify.sh", + ctdb_base); + if (ctdb->notification_script == NULL) { + D_ERR("Unable to set notification script\n"); + goto fail; + } + + /* + * Testing and debug options + */ + + if (test_mode != NULL) { + ctdb->do_setsched = false; + ctdb->do_checkpublicip = false; + fast_start = true; + } + + /* start the protocol running (as a child) */ + return ctdb_start_daemon(ctdb, interactive, test_mode != NULL); + +fail: + talloc_free(ctdb); + exit(1); +} diff 
--git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c new file mode 100644 index 0000000..3ea7d74 --- /dev/null +++ b/ctdb/server/eventscript.c @@ -0,0 +1,845 @@ +/* + event script handling + + Copyright (C) Andrew Tridgell 2007 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/filesys.h" +#include "system/network.h" +#include "system/wait.h" +#include "system/dir.h" +#include "system/locale.h" +#include "system/time.h" +#include "system/dir.h" + +#include <talloc.h> +#include <tevent.h> + +#include "lib/util/dlinklist.h" +#include "lib/util/debug.h" +#include "lib/util/samba_util.h" +#include "lib/util/sys_rw.h" + +#include "ctdb_private.h" + +#include "common/common.h" +#include "common/logging.h" +#include "common/reqid.h" +#include "common/sock_io.h" +#include "common/path.h" + +#include "protocol/protocol_util.h" +#include "event/event_protocol_api.h" + +/* + * Setting up event daemon + */ + +struct eventd_context { + struct tevent_context *ev; + const char *path; + const char *socket; + + /* server state */ + pid_t eventd_pid; + struct tevent_fd *eventd_fde; + + /* client state */ + struct reqid_context *idr; + struct sock_queue *queue; + struct eventd_client_state *calls; +}; + +static bool eventd_context_init(TALLOC_CTX *mem_ctx, + struct ctdb_context *ctdb, + struct eventd_context **out) +{ + struct eventd_context *ectx; + const 
char *eventd = CTDB_HELPER_BINDIR "/ctdb-eventd"; + const char *value; + int ret; + + ectx = talloc_zero(mem_ctx, struct eventd_context); + if (ectx == NULL) { + return false; + } + + ectx->ev = ctdb->ev; + + value = getenv("CTDB_EVENTD"); + if (value != NULL) { + eventd = value; + } + + ectx->path = talloc_strdup(ectx, eventd); + if (ectx->path == NULL) { + talloc_free(ectx); + return false; + } + + ectx->socket = path_socket(ectx, "eventd"); + if (ectx->socket == NULL) { + talloc_free(ectx); + return false; + } + + ret = reqid_init(ectx, 1, &ectx->idr); + if (ret != 0) { + talloc_free(ectx); + return false; + } + + ectx->eventd_pid = -1; + + *out = ectx; + return true; +} + +struct eventd_startup_state { + bool done; + int ret; + int fd; +}; + +static void eventd_startup_timeout_handler(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct eventd_startup_state *state = + (struct eventd_startup_state *) private_data; + + state->done = true; + state->ret = ETIMEDOUT; +} + +static void eventd_startup_handler(struct tevent_context *ev, + struct tevent_fd *fde, uint16_t flags, + void *private_data) +{ + struct eventd_startup_state *state = + (struct eventd_startup_state *)private_data; + unsigned int data; + ssize_t num_read; + + num_read = sys_read(state->fd, &data, sizeof(data)); + if (num_read == sizeof(data)) { + if (data == 0) { + state->ret = 0; + } else { + state->ret = EIO; + } + } else if (num_read == 0) { + state->ret = EPIPE; + } else if (num_read == -1) { + state->ret = errno; + } else { + state->ret = EINVAL; + } + + state->done = true; +} + + +static int wait_for_daemon_startup(struct tevent_context *ev, + int fd) +{ + TALLOC_CTX *mem_ctx; + struct tevent_timer *timer; + struct tevent_fd *fde; + struct eventd_startup_state state = { + .done = false, + .ret = 0, + .fd = fd, + }; + + mem_ctx = talloc_new(ev); + if (mem_ctx == NULL) { + return ENOMEM; + } + + timer = tevent_add_timer(ev, + mem_ctx, + 
tevent_timeval_current_ofs(10, 0), + eventd_startup_timeout_handler, + &state); + if (timer == NULL) { + talloc_free(mem_ctx); + return ENOMEM; + } + + fde = tevent_add_fd(ev, + mem_ctx, + fd, + TEVENT_FD_READ, + eventd_startup_handler, + &state); + if (fde == NULL) { + talloc_free(mem_ctx); + return ENOMEM; + } + + while (! state.done) { + tevent_loop_once(ev); + } + + talloc_free(mem_ctx); + + return state.ret; +} + + +/* + * Start and stop event daemon + */ + +static bool eventd_client_connect(struct eventd_context *ectx); +static void eventd_dead_handler(struct tevent_context *ev, + struct tevent_fd *fde, uint16_t flags, + void *private_data); + +int ctdb_start_eventd(struct ctdb_context *ctdb) +{ + struct eventd_context *ectx; + const char **argv; + int fd[2]; + pid_t pid; + int ret; + bool status; + + if (ctdb->ectx == NULL) { + status = eventd_context_init(ctdb, ctdb, &ctdb->ectx); + if (! status) { + DEBUG(DEBUG_ERR, + ("Failed to initialize eventd context\n")); + return -1; + } + } + + ectx = ctdb->ectx; + + if (! 
sock_clean(ectx->socket)) { + return -1; + } + + ret = pipe(fd); + if (ret != 0) { + return -1; + } + + argv = talloc_array(ectx, const char *, 6); + if (argv == NULL) { + close(fd[0]); + close(fd[1]); + return -1; + } + + argv[0] = ectx->path; + argv[1] = "-P"; + argv[2] = talloc_asprintf(argv, "%d", ctdb->ctdbd_pid); + argv[3] = "-S"; + argv[4] = talloc_asprintf(argv, "%d", fd[1]); + argv[5] = NULL; + + if (argv[2] == NULL || argv[4] == NULL) { + close(fd[0]); + close(fd[1]); + talloc_free(argv); + return -1; + } + + D_NOTICE("Starting event daemon %s %s %s %s %s\n", + argv[0], + argv[1], + argv[2], + argv[3], + argv[4]); + + pid = ctdb_fork(ctdb); + if (pid == -1) { + close(fd[0]); + close(fd[1]); + talloc_free(argv); + return -1; + } + + if (pid == 0) { + close(fd[0]); + ret = execv(argv[0], discard_const(argv)); + if (ret == -1) { + _exit(errno); + } + _exit(0); + } + + talloc_free(argv); + close(fd[1]); + + ret = wait_for_daemon_startup(ctdb->ev, fd[0]); + if (ret != 0) { + ctdb_kill(ctdb, pid, SIGKILL); + close(fd[0]); + D_ERR("Failed to initialize event daemon (%d)\n", ret); + return -1; + } + + ectx->eventd_fde = tevent_add_fd(ctdb->ev, ectx, fd[0], + TEVENT_FD_READ, + eventd_dead_handler, ectx); + if (ectx->eventd_fde == NULL) { + ctdb_kill(ctdb, pid, SIGKILL); + close(fd[0]); + return -1; + } + + tevent_fd_set_auto_close(ectx->eventd_fde); + ectx->eventd_pid = pid; + + status = eventd_client_connect(ectx); + if (! 
status) { + DEBUG(DEBUG_ERR, ("Failed to connect to event daemon\n")); + ctdb_stop_eventd(ctdb); + return -1; + } + + return 0; +} + +static void eventd_dead_handler(struct tevent_context *ev, + struct tevent_fd *fde, uint16_t flags, + void *private_data) +{ + D_ERR("Eventd went away - exiting\n"); + exit(1); +} + +void ctdb_stop_eventd(struct ctdb_context *ctdb) +{ + struct eventd_context *ectx = ctdb->ectx; + + if (ectx == NULL) { + return; + } + + TALLOC_FREE(ectx->eventd_fde); + if (ectx->eventd_pid != -1) { + kill(ectx->eventd_pid, SIGTERM); + ectx->eventd_pid = -1; + } + TALLOC_FREE(ctdb->ectx); +} + +/* + * Connect to event daemon + */ + +struct eventd_client_state { + struct eventd_client_state *prev, *next; + + struct eventd_context *ectx; + void (*callback)(struct ctdb_event_reply *reply, void *private_data); + void *private_data; + + uint32_t reqid; + uint8_t *buf; + size_t buflen; +}; + +static void eventd_client_read(uint8_t *buf, size_t buflen, + void *private_data); +static int eventd_client_state_destructor(struct eventd_client_state *state); + +static bool eventd_client_connect(struct eventd_context *ectx) +{ + int fd; + + if (ectx->queue != NULL) { + return true; + } + + fd = sock_connect(ectx->socket); + if (fd == -1) { + return false; + } + + ectx->queue = sock_queue_setup(ectx, ectx->ev, fd, + eventd_client_read, ectx); + if (ectx->queue == NULL) { + close(fd); + return false; + } + + return true; +} + +static int eventd_client_write(struct eventd_context *ectx, + TALLOC_CTX *mem_ctx, + struct ctdb_event_request *request, + void (*callback)(struct ctdb_event_reply *reply, + void *private_data), + void *private_data) +{ + struct ctdb_event_header header = { 0 }; + struct eventd_client_state *state; + int ret; + + if (! 
eventd_client_connect(ectx)) { + return -1; + } + + state = talloc_zero(mem_ctx, struct eventd_client_state); + if (state == NULL) { + return -1; + } + + state->ectx = ectx; + state->callback = callback; + state->private_data = private_data; + + state->reqid = reqid_new(ectx->idr, state); + if (state->reqid == REQID_INVALID) { + talloc_free(state); + return -1; + } + + talloc_set_destructor(state, eventd_client_state_destructor); + + header.reqid = state->reqid; + + state->buflen = ctdb_event_request_len(&header, request); + state->buf = talloc_size(state, state->buflen); + if (state->buf == NULL) { + talloc_free(state); + return -1; + } + + ret = ctdb_event_request_push(&header, + request, + state->buf, + &state->buflen); + if (ret != 0) { + talloc_free(state); + return -1; + } + + ret = sock_queue_write(ectx->queue, state->buf, state->buflen); + if (ret != 0) { + talloc_free(state); + return -1; + } + + DLIST_ADD(ectx->calls, state); + + return 0; +} + +static int eventd_client_state_destructor(struct eventd_client_state *state) +{ + struct eventd_context *ectx = state->ectx; + + reqid_remove(ectx->idr, state->reqid); + DLIST_REMOVE(ectx->calls, state); + return 0; +} + +static void eventd_client_read(uint8_t *buf, size_t buflen, + void *private_data) +{ + struct eventd_context *ectx = talloc_get_type_abort( + private_data, struct eventd_context); + struct eventd_client_state *state; + struct ctdb_event_header header; + struct ctdb_event_reply *reply; + int ret; + + if (buf == NULL) { + /* connection lost */ + TALLOC_FREE(ectx->queue); + return; + } + + ret = ctdb_event_reply_pull(buf, buflen, &header, ectx, &reply); + if (ret != 0) { + D_ERR("Invalid packet received, ret=%d\n", ret); + return; + } + + if (buflen != header.length) { + D_ERR("Packet size mismatch %zu != %"PRIu32"\n", + buflen, header.length); + talloc_free(reply); + return; + } + + state = reqid_find(ectx->idr, header.reqid, + struct eventd_client_state); + if (state == NULL) { + 
talloc_free(reply); + return; + } + + if (state->reqid != header.reqid) { + talloc_free(reply); + return; + } + + state = talloc_steal(reply, state); + state->callback(reply, state->private_data); + talloc_free(reply); +} + +/* + * Run an event + */ + +struct eventd_client_run_state { + struct eventd_context *ectx; + void (*callback)(int result, void *private_data); + void *private_data; +}; + +static void eventd_client_run_done(struct ctdb_event_reply *reply, + void *private_data); + +static int eventd_client_run(struct eventd_context *ectx, + TALLOC_CTX *mem_ctx, + void (*callback)(int result, + void *private_data), + void *private_data, + enum ctdb_event event, + const char *arg_str, + uint32_t timeout) +{ + struct eventd_client_run_state *state; + struct ctdb_event_request request; + struct ctdb_event_request_run rdata; + int ret; + + state = talloc_zero(mem_ctx, struct eventd_client_run_state); + if (state == NULL) { + return -1; + } + + state->ectx = ectx; + state->callback = callback; + state->private_data = private_data; + + rdata.component = "legacy"; + rdata.event = ctdb_event_to_string(event); + rdata.args = arg_str; + rdata.timeout = timeout; + rdata.flags = 0; + + request.cmd = CTDB_EVENT_CMD_RUN; + request.data.run = &rdata; + + ret = eventd_client_write(ectx, state, &request, + eventd_client_run_done, state); + if (ret != 0) { + talloc_free(state); + return ret; + } + + return 0; +} + +static void eventd_client_run_done(struct ctdb_event_reply *reply, + void *private_data) +{ + struct eventd_client_run_state *state = talloc_get_type_abort( + private_data, struct eventd_client_run_state); + + state = talloc_steal(state->ectx, state); + state->callback(reply->result, state->private_data); + talloc_free(state); +} + +/* + * CTDB event script functions + */ + +int ctdb_event_script_run(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + void (*callback)(struct ctdb_context *ctdb, + int result, void *private_data), + void *private_data, + enum ctdb_event 
event, + const char *fmt, va_list ap) + PRINTF_ATTRIBUTE(6,0); + +struct ctdb_event_script_run_state { + struct ctdb_context *ctdb; + void (*callback)(struct ctdb_context *ctdb, int result, + void *private_data); + void *private_data; + enum ctdb_event event; +}; + +static bool event_allowed_during_recovery(enum ctdb_event event); +static void ctdb_event_script_run_done(int result, void *private_data); +static bool check_options(enum ctdb_event call, const char *options); + +int ctdb_event_script_run(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + void (*callback)(struct ctdb_context *ctdb, + int result, void *private_data), + void *private_data, + enum ctdb_event event, + const char *fmt, va_list ap) +{ + struct ctdb_event_script_run_state *state; + char *arg_str; + int ret; + + if ( (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) && + (! event_allowed_during_recovery(event)) ) { + DEBUG(DEBUG_ERR, + ("Refusing to run event '%s' while in recovery\n", + ctdb_eventscript_call_names[event])); + return -1; + } + + state = talloc_zero(mem_ctx, struct ctdb_event_script_run_state); + if (state == NULL) { + return -1; + } + + state->ctdb = ctdb; + state->callback = callback; + state->private_data = private_data; + state->event = event; + + if (fmt != NULL) { + arg_str = talloc_vasprintf(state, fmt, ap); + if (arg_str == NULL) { + talloc_free(state); + return -1; + } + } else { + arg_str = NULL; + } + + if (! 
check_options(event, arg_str)) { + DEBUG(DEBUG_ERR, + ("Bad event script arguments '%s' for '%s'\n", + arg_str, ctdb_eventscript_call_names[event])); + talloc_free(arg_str); + return -1; + } + + ret = eventd_client_run(ctdb->ectx, state, + ctdb_event_script_run_done, state, + event, arg_str, ctdb->tunable.script_timeout); + if (ret != 0) { + talloc_free(state); + return ret; + } + + DEBUG(DEBUG_INFO, + (__location__ " Running event %s with arguments %s\n", + ctdb_eventscript_call_names[event], arg_str)); + + talloc_free(arg_str); + return 0; +} + +static void ctdb_event_script_run_done(int result, void *private_data) +{ + struct ctdb_event_script_run_state *state = talloc_get_type_abort( + private_data, struct ctdb_event_script_run_state); + + if (result == ETIMEDOUT) { + switch (state->event) { + case CTDB_EVENT_START_RECOVERY: + case CTDB_EVENT_RECOVERED: + case CTDB_EVENT_TAKE_IP: + case CTDB_EVENT_RELEASE_IP: + DEBUG(DEBUG_ERR, + ("Ignoring hung script for %s event\n", + ctdb_eventscript_call_names[state->event])); + result = 0; + break; + + default: + break; + } + } + + state = talloc_steal(state->ctdb, state); + state->callback(state->ctdb, result, state->private_data); + talloc_free(state); +} + + +static unsigned int count_words(const char *options) +{ + unsigned int words = 0; + + if (options == NULL) { + return 0; + } + + options += strspn(options, " \t"); + while (*options) { + words++; + options += strcspn(options, " \t"); + options += strspn(options, " \t"); + } + return words; +} + +static bool check_options(enum ctdb_event call, const char *options) +{ + switch (call) { + /* These all take no arguments. */ + case CTDB_EVENT_INIT: + case CTDB_EVENT_SETUP: + case CTDB_EVENT_STARTUP: + case CTDB_EVENT_START_RECOVERY: + case CTDB_EVENT_RECOVERED: + case CTDB_EVENT_MONITOR: + case CTDB_EVENT_SHUTDOWN: + case CTDB_EVENT_IPREALLOCATED: + return count_words(options) == 0; + + case CTDB_EVENT_TAKE_IP: /* interface, IP address, netmask bits. 
*/ + case CTDB_EVENT_RELEASE_IP: + return count_words(options) == 3; + + case CTDB_EVENT_UPDATE_IP: /* old interface, new interface, IP address, netmask bits. */ + return count_words(options) == 4; + + default: + DEBUG(DEBUG_ERR,(__location__ "Unknown ctdb_event %u\n", call)); + return false; + } +} + +/* only specific events are allowed while in recovery */ +static bool event_allowed_during_recovery(enum ctdb_event event) +{ + const enum ctdb_event allowed_events[] = { + CTDB_EVENT_INIT, + CTDB_EVENT_SETUP, + CTDB_EVENT_START_RECOVERY, + CTDB_EVENT_SHUTDOWN, + CTDB_EVENT_RELEASE_IP, + CTDB_EVENT_IPREALLOCATED, + }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(allowed_events); i++) { + if (event == allowed_events[i]) { + return true; + } + } + + return false; +} + +/* + run the event script in the background, calling the callback when + finished. If mem_ctx is freed, callback will never be called. + */ +int ctdb_event_script_callback(struct ctdb_context *ctdb, + TALLOC_CTX *mem_ctx, + void (*callback)(struct ctdb_context *, int, void *), + void *private_data, + enum ctdb_event call, + const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = ctdb_event_script_run(ctdb, mem_ctx, callback, private_data, + call, fmt, ap); + va_end(ap); + + return ret; +} + + +struct ctdb_event_script_args_state { + bool done; + int status; +}; + +static void ctdb_event_script_args_done(struct ctdb_context *ctdb, + int status, void *private_data) +{ + struct ctdb_event_script_args_state *s = + (struct ctdb_event_script_args_state *)private_data; + + s->done = true; + s->status = status; +} + +/* + run the event script, waiting for it to complete. Used when the caller + doesn't want to continue till the event script has finished. + */ +int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_event call, + const char *fmt, ...) 
+{ + va_list ap; + int ret; + struct ctdb_event_script_args_state state = { + .status = -1, + .done = false, + }; + + va_start(ap, fmt); + ret = ctdb_event_script_run(ctdb, ctdb, + ctdb_event_script_args_done, &state, + call, fmt, ap); + va_end(ap); + if (ret != 0) { + return ret; + } + + while (! state.done) { + tevent_loop_once(ctdb->ev); + } + + if (state.status == ETIMEDOUT) { + /* Don't ban self if CTDB is starting up or shutting down */ + if (call != CTDB_EVENT_INIT && call != CTDB_EVENT_SHUTDOWN) { + DEBUG(DEBUG_ERR, + (__location__ " eventscript for '%s' timed out." + " Immediately banning ourself for %d seconds\n", + ctdb_eventscript_call_names[call], + ctdb->tunable.recovery_ban_period)); + ctdb_ban_self(ctdb); + } + } + + return state.status; +} + +int ctdb_event_script(struct ctdb_context *ctdb, enum ctdb_event call) +{ + /* GCC complains about empty format string, so use %s and "". */ + return ctdb_event_script_args(ctdb, call, NULL); +} + +void ctdb_event_reopen_logs(struct ctdb_context *ctdb) +{ + if (ctdb->ectx->eventd_pid > 0) { + kill(ctdb->ectx->eventd_pid, SIGHUP); + } +} diff --git a/ctdb/server/ipalloc.c b/ctdb/server/ipalloc.c new file mode 100644 index 0000000..7f49364 --- /dev/null +++ b/ctdb/server/ipalloc.c @@ -0,0 +1,284 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" + +#include <talloc.h> + +#include "lib/util/debug.h" + +#include "common/logging.h" +#include "common/rb_tree.h" + +#include "protocol/protocol_util.h" + +#include "server/ipalloc_private.h" + +/* Initialise main ipalloc state and sub-structures */ +struct ipalloc_state * +ipalloc_state_init(TALLOC_CTX *mem_ctx, + uint32_t num_nodes, + enum ipalloc_algorithm algorithm, + bool no_ip_takeover, + bool no_ip_failback, + uint32_t *force_rebalance_nodes) +{ + struct ipalloc_state *ipalloc_state = + talloc_zero(mem_ctx, struct ipalloc_state); + if (ipalloc_state == NULL) { + DEBUG(DEBUG_ERR, (__location__ " Out of memory\n")); + return NULL; + } + + ipalloc_state->num = num_nodes; + + ipalloc_state->algorithm = algorithm; + ipalloc_state->no_ip_takeover = no_ip_takeover; + ipalloc_state->no_ip_failback = no_ip_failback; + ipalloc_state->force_rebalance_nodes = force_rebalance_nodes; + + return ipalloc_state; +} + +static void *add_ip_callback(void *parm, void *data) +{ + struct public_ip_list *this_ip = parm; + struct public_ip_list *prev_ip = data; + + if (prev_ip == NULL) { + return parm; + } + if (this_ip->pnn == CTDB_UNKNOWN_PNN) { + this_ip->pnn = prev_ip->pnn; + } + + return parm; +} + +static int getips_count_callback(void *param, void *data) +{ + struct public_ip_list **ip_list = (struct public_ip_list **)param; + struct public_ip_list *new_ip = (struct public_ip_list *)data; + + new_ip->next = *ip_list; + *ip_list = new_ip; + return 0; +} + +/* Nodes only know about those public addresses that they are + * configured to serve and no individual node has a full list of all + * public addresses configured across the cluster. Therefore, a + * merged list of all public addresses needs to be built so that IP + * allocation can be done. 
*/ +static struct public_ip_list * +create_merged_ip_list(struct ipalloc_state *ipalloc_state) +{ + unsigned int i, j; + struct public_ip_list *ip_list; + struct ctdb_public_ip_list *public_ips; + struct trbt_tree *ip_tree; + int ret; + + ip_tree = trbt_create(ipalloc_state, 0); + + if (ipalloc_state->known_public_ips == NULL) { + DEBUG(DEBUG_ERR, ("Known public IPs not set\n")); + return NULL; + } + + for (i=0; i < ipalloc_state->num; i++) { + + public_ips = &ipalloc_state->known_public_ips[i]; + + for (j=0; j < public_ips->num; j++) { + struct public_ip_list *tmp_ip; + + /* This is returned as part of ip_list */ + tmp_ip = talloc_zero(ipalloc_state, struct public_ip_list); + if (tmp_ip == NULL) { + DEBUG(DEBUG_ERR, + (__location__ " out of memory\n")); + talloc_free(ip_tree); + return NULL; + } + + /* Do not use information about IP addresses hosted + * on other nodes, it may not be accurate */ + if (public_ips->ip[j].pnn == i) { + tmp_ip->pnn = public_ips->ip[j].pnn; + } else { + tmp_ip->pnn = CTDB_UNKNOWN_PNN; + } + tmp_ip->addr = public_ips->ip[j].addr; + tmp_ip->next = NULL; + + trbt_insertarray32_callback(ip_tree, + IP_KEYLEN, ip_key(&public_ips->ip[j].addr), + add_ip_callback, + tmp_ip); + } + } + + ip_list = NULL; + ret = trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list); + if (ret != 0) { + DBG_ERR("Error traversing the IP tree.\n"); + } + + talloc_free(ip_tree); + + return ip_list; +} + +static bool populate_bitmap(struct ipalloc_state *ipalloc_state) +{ + struct public_ip_list *ip = NULL; + unsigned int i, j; + + for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) { + + ip->known_on = bitmap_talloc(ip, ipalloc_state->num); + if (ip->known_on == NULL) { + return false; + } + + ip->available_on = bitmap_talloc(ip, ipalloc_state->num); + if (ip->available_on == NULL) { + return false; + } + + for (i = 0; i < ipalloc_state->num; i++) { + struct ctdb_public_ip_list *known = + &ipalloc_state->known_public_ips[i]; + struct 
ctdb_public_ip_list *avail = + &ipalloc_state->available_public_ips[i]; + + /* Check to see if "ip" is available on node "i" */ + for (j = 0; j < avail->num; j++) { + if (ctdb_sock_addr_same_ip( + &ip->addr, &avail->ip[j].addr)) { + bitmap_set(ip->available_on, i); + break; + } + } + + /* Optimisation: available => known */ + if (bitmap_query(ip->available_on, i)) { + bitmap_set(ip->known_on, i); + continue; + } + + /* Check to see if "ip" is known on node "i" */ + for (j = 0; j < known->num; j++) { + if (ctdb_sock_addr_same_ip( + &ip->addr, &known->ip[j].addr)) { + bitmap_set(ip->known_on, i); + break; + } + } + } + } + + return true; +} + +void ipalloc_set_public_ips(struct ipalloc_state *ipalloc_state, + struct ctdb_public_ip_list *known_ips, + struct ctdb_public_ip_list *available_ips) +{ + ipalloc_state->available_public_ips = available_ips; + ipalloc_state->known_public_ips = known_ips; +} + +/* This can only return false if there are no available IPs *and* + * there are no IP addresses currently allocated. If the latter is + * true then the cluster can clearly host IPs... just not necessarily + * right now... */ +bool ipalloc_can_host_ips(struct ipalloc_state *ipalloc_state) +{ + unsigned int i; + bool have_ips = false; + + for (i=0; i < ipalloc_state->num; i++) { + struct ctdb_public_ip_list *ips = + ipalloc_state->known_public_ips; + if (ips[i].num != 0) { + unsigned int j; + have_ips = true; + /* Succeed if an address is hosted on node i */ + for (j=0; j < ips[i].num; j++) { + if (ips[i].ip[j].pnn == i) { + return true; + } + } + } + } + + if (! have_ips) { + return false; + } + + /* At this point there are known addresses but none are + * hosted. Need to check if cluster can now host some + * addresses. + */ + for (i=0; i < ipalloc_state->num; i++) { + if (ipalloc_state->available_public_ips[i].num != 0) { + return true; + } + } + + return false; +} + +/* The calculation part of the IP allocation algorithm. 
*/ +struct public_ip_list *ipalloc(struct ipalloc_state *ipalloc_state) +{ + bool ret = false; + + ipalloc_state->all_ips = create_merged_ip_list(ipalloc_state); + if (ipalloc_state->all_ips == NULL) { + return NULL; + } + + if (!populate_bitmap(ipalloc_state)) { + return NULL; + } + + switch (ipalloc_state->algorithm) { + case IPALLOC_LCP2: + ret = ipalloc_lcp2(ipalloc_state); + break; + case IPALLOC_DETERMINISTIC: + ret = ipalloc_deterministic(ipalloc_state); + break; + case IPALLOC_NONDETERMINISTIC: + ret = ipalloc_nondeterministic(ipalloc_state); + break; + } + + /* at this point ->pnn is the node which will own each IP + or CTDB_UNKNOWN_PNN if there is no node that can cover this ip + */ + + return (ret ? ipalloc_state->all_ips : NULL); +} diff --git a/ctdb/server/ipalloc.h b/ctdb/server/ipalloc.h new file mode 100644 index 0000000..42aec9e --- /dev/null +++ b/ctdb/server/ipalloc.h @@ -0,0 +1,67 @@ +/* + CTDB IP takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2015 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __CTDB_IPALLOC_H__ +#define __CTDB_IPALLOC_H__ + +#include <talloc.h> + +#include "replace.h" +#include "system/network.h" + +#include "lib/util/bitmap.h" + +struct public_ip_list { + struct public_ip_list *next; + uint32_t pnn; + ctdb_sock_addr addr; + struct bitmap *known_on; + struct bitmap *available_on; +}; + +#define IP_KEYLEN 4 +uint32_t *ip_key(ctdb_sock_addr *ip); + +/* Flags used in IP allocation algorithms. */ +enum ipalloc_algorithm { + IPALLOC_DETERMINISTIC, + IPALLOC_NONDETERMINISTIC, + IPALLOC_LCP2, +}; + +struct ipalloc_state; + +struct ipalloc_state * ipalloc_state_init(TALLOC_CTX *mem_ctx, + uint32_t num_nodes, + enum ipalloc_algorithm algorithm, + bool no_ip_takeover, + bool no_ip_failback, + uint32_t *force_rebalance_nodes); + +void ipalloc_set_public_ips(struct ipalloc_state *ipalloc_state, + struct ctdb_public_ip_list *known_ips, + struct ctdb_public_ip_list *available_ips); + +bool ipalloc_can_host_ips(struct ipalloc_state *ipalloc_state); + +struct public_ip_list *ipalloc(struct ipalloc_state *ipalloc_state); + +#endif /* __CTDB_IPALLOC_H__ */ diff --git a/ctdb/server/ipalloc_common.c b/ctdb/server/ipalloc_common.c new file mode 100644 index 0000000..a5177d4 --- /dev/null +++ b/ctdb/server/ipalloc_common.c @@ -0,0 +1,192 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "replace.h" +#include "system/network.h" + +#include "ctdb_private.h" + +#include "lib/util/time.h" + +#include "lib/util/debug.h" +#include "common/logging.h" + +#include "common/common.h" + +#include "protocol/protocol_util.h" + +#include "server/ipalloc_private.h" + +#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0) + +/* Given a physical node, return the number of + public addresses that is currently assigned to this node. +*/ +int node_ip_coverage(uint32_t pnn, struct public_ip_list *ips) +{ + int num=0; + + for (;ips;ips=ips->next) { + if (ips->pnn == pnn) { + num++; + } + } + return num; +} + + +/* Can the given node host the given IP: is the public IP known to the + * node and is NOIPHOST unset? +*/ +static bool can_node_host_ip(struct ipalloc_state *ipalloc_state, + int32_t pnn, + struct public_ip_list *ip) +{ + return bitmap_query(ip->available_on, pnn); +} + +bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state, + int32_t pnn, + struct public_ip_list *ip) +{ + if (ipalloc_state->no_ip_takeover) { + return false; + } + + return can_node_host_ip(ipalloc_state, pnn, ip); +} + +/* search the node lists list for a node to takeover this ip. + pick the node that currently are serving the least number of ips + so that the ips get spread out evenly. +*/ +int find_takeover_node(struct ipalloc_state *ipalloc_state, + struct public_ip_list *ip) +{ + unsigned int pnn; + int min=0, num; + unsigned int i, numnodes; + + numnodes = ipalloc_state->num; + pnn = CTDB_UNKNOWN_PNN; + for (i=0; i<numnodes; i++) { + /* verify that this node can serve this ip */ + if (!can_node_takeover_ip(ipalloc_state, i, ip)) { + /* no it couldn't so skip to the next node */ + continue; + } + + num = node_ip_coverage(i, ipalloc_state->all_ips); + /* was this the first node we checked ? 
*/ + if (pnn == CTDB_UNKNOWN_PNN) { + pnn = i; + min = num; + } else { + if (num < min) { + pnn = i; + min = num; + } + } + } + if (pnn == CTDB_UNKNOWN_PNN) { + DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n", + ctdb_sock_addr_to_string(ipalloc_state, + &ip->addr, + false))); + + return -1; + } + + ip->pnn = pnn; + return 0; +} + +uint32_t *ip_key(ctdb_sock_addr *ip) +{ + static uint32_t key[IP_KEYLEN]; + + bzero(key, sizeof(key)); + + switch (ip->sa.sa_family) { + case AF_INET: + key[3] = htonl(ip->ip.sin_addr.s_addr); + break; + case AF_INET6: { + uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr); + key[0] = htonl(s6_a32[0]); + key[1] = htonl(s6_a32[1]); + key[2] = htonl(s6_a32[2]); + key[3] = htonl(s6_a32[3]); + break; + } + default: + DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family)); + return key; + } + + return key; +} + +/* Allocate any unassigned IPs just by looping through the IPs and + * finding the best node for each. + */ +void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state) +{ + struct public_ip_list *t; + + /* loop over all ip's and find a physical node to cover for + each unassigned ip. + */ + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn == CTDB_UNKNOWN_PNN) { + if (find_takeover_node(ipalloc_state, t)) { + DEBUG(DEBUG_WARNING, + ("Failed to find node to cover ip %s\n", + ctdb_sock_addr_to_string(ipalloc_state, + &t->addr, + false))); + } + } + } +} + +void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state) +{ + struct public_ip_list *t; + + /* verify that the assigned nodes can serve that public ip + and set it to CTDB_UNKNOWN_PNN if not + */ + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn == CTDB_UNKNOWN_PNN) { + continue; + } + if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) { + /* this node can not serve this ip. 
*/ + DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n", + ctdb_sock_addr_to_string( + ipalloc_state, + &t->addr, false), + t->pnn)); + t->pnn = CTDB_UNKNOWN_PNN; + } + } +} diff --git a/ctdb/server/ipalloc_deterministic.c b/ctdb/server/ipalloc_deterministic.c new file mode 100644 index 0000000..43680ba --- /dev/null +++ b/ctdb/server/ipalloc_deterministic.c @@ -0,0 +1,191 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. 
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+#include "common/path.h"
+
+#include "protocol/protocol_util.h"
+#include "lib/util/smb_strtox.h"
+#include "lib/util/memory.h"
+
+#include "server/ipalloc_private.h"
+
+/* One "address -> preferred node" entry read from the home_nodes file */
+struct home_node {
+	ctdb_sock_addr addr;
+	uint32_t pnn;
+};
+
+/*
+ * Read the "home_nodes" file from the CTDB etc directory.
+ *
+ * Each line holds an address and a node number separated by spaces or
+ * tabs; lines with fewer than two fields are skipped.  The entries
+ * are returned as a talloc array allocated on mem_ctx, so the caller
+ * can get the count with talloc_array_length().
+ *
+ * Returns NULL if the file can not be opened, if it contains no
+ * entries, if reading or allocation fails, or if any address or node
+ * number does not parse - in that case everything read so far is
+ * discarded.
+ */
+static struct home_node *ipalloc_get_home_nodes(TALLOC_CTX *mem_ctx)
+{
+	char *line = NULL;
+	size_t len = 0;
+	char *fname = NULL;
+	FILE *fp = NULL;
+	struct home_node *result = NULL;
+
+	fname = path_etcdir_append(mem_ctx, "home_nodes");
+	if (fname == NULL) {
+		goto fail;
+	}
+
+	fp = fopen(fname, "r");
+	if (fp == NULL) {
+		goto fail;
+	}
+	TALLOC_FREE(fname);
+
+	while (true) {
+		size_t num_nodes = talloc_array_length(result);
+		char *saveptr = NULL, *addrstr = NULL, *nodestr = NULL;
+		struct home_node hn = {
+			.pnn = CTDB_UNKNOWN_PNN,
+		};
+		struct home_node *tmp = NULL;
+		ssize_t n = 0;
+		int ret;
+
+		n = getline(&line, &len, fp);
+		if (n < 0) {
+			if (!feof(fp)) {
+				/* real error */
+				goto fail;
+			}
+			break;
+		}
+		/* Strip the trailing newline, if there is one */
+		if ((n > 0) && (line[n - 1] == '\n')) {
+			line[n - 1] = '\0';
+		}
+
+		addrstr = strtok_r(line, " \t", &saveptr);
+		if (addrstr == NULL) {
+			continue;
+		}
+		nodestr = strtok_r(NULL, " \t", &saveptr);
+		if (nodestr == NULL) {
+			continue;
+		}
+
+		ret = ctdb_sock_addr_from_string(addrstr, &hn.addr, false);
+		if (ret != 0) {
+			DBG_WARNING("Could not parse %s: %s\n",
+				    addrstr,
+				    strerror(ret));
+			goto fail;
+		}
+
+		hn.pnn = smb_strtoul(nodestr,
+				     NULL,
+				     10,
+				     &ret,
+				     SMB_STR_FULL_STR_CONV);
+		if (ret != 0) {
+			DBG_WARNING("Could not parse \"%s\"\n", nodestr);
+			goto fail;
+		}
+
+		/* Grow via a temporary so result survives a failure */
+		tmp = talloc_realloc(mem_ctx,
+				     result,
+				     struct home_node,
+				     num_nodes + 1);
+		if (tmp == NULL) {
+			goto fail;
+		}
+		result = tmp;
+		result[num_nodes] = hn;
+	}
+
+	fclose(fp);
+	fp = NULL;
+	/* The getline() buffer is malloc'ed and must be released on
+	 * the success path as well as on failure.
+	 */
+	SAFE_FREE(line);
+	return result;
+
+fail:
+	if (fp != NULL) {
+		fclose(fp);
+		fp = NULL;
+	}
+	SAFE_FREE(line);
+	TALLOC_FREE(fname);
+	TALLOC_FREE(result);
+	return NULL;
+}
+
+/*
+ * Deterministic IP allocation.
+ *
+ * Every address is first given to node (index in list) % (number of
+ * nodes).  An address that has an entry in the home_nodes file is
+ * given to that entry's node instead, unless the node number is out
+ * of range, in which case a warning is logged and the modulo choice
+ * is kept.  Unsuitable assignments are then dropped and anything
+ * left unassigned is handed to basic_allocate_unassigned().
+ *
+ * Always returns true.
+ */
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
+{
+	struct home_node *home_nodes = ipalloc_get_home_nodes(ipalloc_state);
+	size_t num_home_nodes = talloc_array_length(home_nodes);
+	struct public_ip_list *t;
+	int i;
+	uint32_t numnodes;
+
+	numnodes = ipalloc_state->num;
+
+	DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+	/* Allocate IPs to nodes in a modulo fashion so that IPs will
+	 * always be allocated the same way for a specific set of
+	 * available/unavailable nodes.
+	 */
+
+	for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
+		size_t j;
+
+		if (numnodes == 0) {
+			/* No nodes at all: avoid the modulo by zero
+			 * and leave the address unassigned.
+			 */
+			t->pnn = CTDB_UNKNOWN_PNN;
+			continue;
+		}
+
+		t->pnn = i % numnodes;
+
+		for (j = 0; j < num_home_nodes; j++) {
+			struct home_node *hn = &home_nodes[j];
+
+			if (ctdb_sock_addr_same_ip(&t->addr, &hn->addr)) {
+
+				if (hn->pnn >= numnodes) {
+					DBG_WARNING("pnn %" PRIu32
+						    " too large\n",
+						    hn->pnn);
+					break;
+				}
+
+				t->pnn = hn->pnn;
+				break;
+			}
+		}
+	}
+
+	/* IP failback doesn't make sense with deterministic
+	 * IPs, since the modulo step above implicitly fails
+	 * back IPs to their "home" node.
+	 */
+	if (ipalloc_state->no_ip_failback) {
+		D_WARNING("WARNING: 'NoIPFailback' set but ignored - "
+			  "incompatible with 'Deterministic IPs\n");
+	}
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	basic_allocate_unassigned(ipalloc_state);
+
+	/* No failback here! */
+
+	TALLOC_FREE(home_nodes);
+
+	return true;
+}
diff --git a/ctdb/server/ipalloc_lcp2.c b/ctdb/server/ipalloc_lcp2.c
new file mode 100644
index 0000000..996adcf
--- /dev/null
+++ b/ctdb/server/ipalloc_lcp2.c
@@ -0,0 +1,525 @@
+/*
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg 2007
+   Copyright (C) Andrew Tridgell 2007
+   Copyright (C) Martin Schwenke 2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/*
+ * This is the length of the longest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes. The implementation means that all
+ * addresses end up being 128 bits long.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
+ */
+static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
+{
+	const uint32_t top_bit = (uint32_t)1 << 31;
+	uint32_t key1[IP_KEYLEN];
+	uint32_t *key2;
+	uint32_t distance = 0;
+	int word;
+
+	/* Take a private copy of the first key before asking ip_key()
+	 * for the second one.
+	 */
+	memcpy(key1, ip_key(ip1), sizeof(key1));
+	key2 = ip_key(ip2);
+
+	/* NOTE(review): the loop carries on after the first word that
+	 * differs, so matching leading bits of later words are added
+	 * to the result as well.
+	 */
+	for (word = 0; word < IP_KEYLEN; word++) {
+		uint32_t diff = key1[word] ^ key2[word];
+
+		if (diff == 0) {
+			/* Whole word identical */
+			distance += 32;
+			continue;
+		}
+
+		/* Count number of leading zeroes.
+		 * FIXME? This could be optimised...
+		 */
+		for (; (diff & top_bit) == 0; diff <<= 1) {
+			distance += 1;
+		}
+	}
+
+	return distance;
+}
+
+/* Sum of the squared IP distances between the given IP and every
+   address in ips that is currently assigned to node pnn.  The ips
+   argument is generally the all_ips list used in the main part of
+   the algorithm.
+ */
+static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+				  struct public_ip_list *ips,
+				  unsigned int pnn)
+{
+	struct public_ip_list *other;
+	uint32_t total = 0;
+
+	for (other = ips; other != NULL; other = other->next) {
+		uint32_t d;
+
+		/* Only addresses on the given node count.  The address
+		 * itself is also skipped: that lets the same routine
+		 * give the effect of removing an address from a node.
+		 * Addresses are assumed to come from all_ips, so they
+		 * are identified by pointer rather than by a more
+		 * expensive address comparison.
+		 */
+		if (other->pnn != pnn || &(other->addr) == ip) {
+			continue;
+		}
+
+		d = ip_distance(ip, &(other->addr));
+		total += d * d;  /* Cheaper than pulling in math.h :-) */
+	}
+
+	return total;
+}
+
+/* Return the LCP2 imbalance metric for addresses currently assigned
+   to the given node.
+ */
+static uint32_t lcp2_imbalance(struct public_ip_list * all_ips,
+			       unsigned int pnn)
+{
+	struct public_ip_list *cur;
+	uint32_t total = 0;
+
+	for (cur = all_ips; cur != NULL; cur = cur->next) {
+		if (cur->pnn == pnn) {
+			/* Pass the rest of the IPs rather than the
+			 * whole all_ips input list, so each pair of
+			 * addresses is visited once.
+			 */
+			total += ip_distance_2_sum(&(cur->addr),
+						   cur->next,
+						   pnn);
+		}
+	}
+
+	return total;
+}
+
+/*
+ * Allocate and fill the two per-node arrays used by the LCP2
+ * algorithm: the current imbalance of each node and whether the node
+ * may receive addresses during failback.  Both arrays are talloc
+ * children of ipalloc_state.  Returns false on allocation failure.
+ */
+static bool lcp2_init(struct ipalloc_state *ipalloc_state,
+		      uint32_t **lcp2_imbalances,
+		      bool **rebalance_candidates)
+{
+	unsigned int numnodes = ipalloc_state->num;
+	unsigned int node;
+	size_t num_forced;
+	size_t f;
+	struct public_ip_list *ip;
+
+	*rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
+	if (*rebalance_candidates == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return false;
+	}
+	*lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
+	if (*lcp2_imbalances == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return false;
+	}
+
+	for (node = 0; node < numnodes; node++) {
+		(*lcp2_imbalances)[node] =
+			lcp2_imbalance(ipalloc_state->all_ips, node);
+		/* First step: assume all nodes are candidates */
+		(*rebalance_candidates)[node] = true;
+	}
+
+	/* 2nd step: if a node has IPs assigned then it must have been
+	 * healthy before, so we remove it from consideration. This
+	 * is overkill but is all we have because we don't maintain
+	 * state between takeover runs. An alternative would be to
+	 * keep state and invalidate it every time the recovery master
+	 * changes.
+	 */
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+		if (ip->pnn != CTDB_UNKNOWN_PNN) {
+			(*rebalance_candidates)[ip->pnn] = false;
+		}
+	}
+
+	/* 3rd step: if a node is forced to re-balance then
+	   we allow failback onto the node */
+	if (ipalloc_state->force_rebalance_nodes == NULL) {
+		return true;
+	}
+	num_forced = talloc_array_length(ipalloc_state->force_rebalance_nodes);
+	for (f = 0; f < num_forced; f++) {
+		uint32_t pnn = ipalloc_state->force_rebalance_nodes[f];
+
+		if (pnn >= numnodes) {
+			DEBUG(DEBUG_ERR,
+			      (__location__ "unknown node %u\n", pnn));
+			continue;
+		}
+
+		DEBUG(DEBUG_NOTICE,
+		      ("Forcing rebalancing of IPs to node %u\n", pnn));
+		(*rebalance_candidates)[pnn] = true;
+	}
+
+	return true;
+}
+
+/* Allocate any unassigned addresses using the LCP2 algorithm to find
+ * the IP/node combination that will cost the least.
+ *
+ * Each pass looks at every unassigned address on every node that can
+ * take it over, assigns the single pair with the smallest distance
+ * sum and updates that node's imbalance.  Passes repeat until nothing
+ * is unassigned or a pass finds no pair; anything still unassigned is
+ * then reported with a warning.
+ */
+static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
+				     uint32_t *lcp2_imbalances)
+{
+	unsigned int numnodes = ipalloc_state->num;
+	struct public_ip_list *ip;
+	bool have_unassigned;
+	bool placed;
+
+	do {
+		/* Best address/node pair seen during this pass */
+		struct public_ip_list *best_ip = NULL;
+		unsigned int best_node = CTDB_UNKNOWN_PNN;
+		uint32_t best_dsum = 0;
+		uint32_t best_imbl = 0;
+		unsigned int node;
+
+		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+		DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+		/* loop over each unassigned ip. */
+		for (ip = ipalloc_state->all_ips; ip != NULL ; ip = ip->next) {
+			if (ip->pnn != CTDB_UNKNOWN_PNN) {
+				continue;
+			}
+
+			for (node = 0; node < numnodes; node++) {
+				uint32_t dsum, imbl;
+
+				/* only check nodes that can actually takeover this ip */
+				if (!can_node_takeover_ip(ipalloc_state,
+							  node,
+							  ip)) {
+					/* no it couldn't so skip to the next node */
+					continue;
+				}
+
+				dsum = ip_distance_2_sum(&(ip->addr),
+							 ipalloc_state->all_ips,
+							 node);
+				imbl = lcp2_imbalances[node] + dsum;
+				DEBUG(DEBUG_DEBUG,
+				      (" %s -> %d [+%d]\n",
+				       ctdb_sock_addr_to_string(ipalloc_state,
+								&(ip->addr),
+								false),
+				       node,
+				       imbl - lcp2_imbalances[node]));
+
+				if (best_node == CTDB_UNKNOWN_PNN ||
+				    dsum < best_dsum) {
+					best_node = node;
+					best_imbl = imbl;
+					best_dsum = dsum;
+					best_ip = ip;
+				}
+			}
+		}
+
+		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+		/* If we found one then assign it to the given node. */
+		placed = (best_node != CTDB_UNKNOWN_PNN);
+		if (placed) {
+			best_ip->pnn = best_node;
+			lcp2_imbalances[best_node] = best_imbl;
+			DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+					  ctdb_sock_addr_to_string(
+						  ipalloc_state,
+						  &(best_ip->addr), false),
+					  best_node,
+					  best_dsum));
+		}
+
+		/* There might be a better way but at least this is clear. */
+		have_unassigned = false;
+		for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+			if (ip->pnn == CTDB_UNKNOWN_PNN) {
+				have_unassigned = true;
+				break;
+			}
+		}
+	} while (have_unassigned && placed);
+
+	/* We know if we have an unassigned addresses so we might as
+	 * well optimise.
+	 */
+	if (!have_unassigned) {
+		return;
+	}
+
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+		if (ip->pnn == CTDB_UNKNOWN_PNN) {
+			DEBUG(DEBUG_WARNING,
+			      ("Failed to find node to cover ip %s\n",
+			       ctdb_sock_addr_to_string(ipalloc_state,
+							&ip->addr,
+							false)));
+		}
+	}
+}
+
+/* LCP2 algorithm for rebalancing the cluster. Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.
+ */
+static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
+				    unsigned int srcnode,
+				    uint32_t *lcp2_imbalances,
+				    bool *rebalance_candidates)
+{
+	unsigned int numnodes = ipalloc_state->num;
+	/* Best move found so far: which IP, where to, and the
+	 * imbalances both nodes would end up with.
+	 */
+	struct public_ip_list *best_ip = NULL;
+	unsigned int best_dst = CTDB_UNKNOWN_PNN;
+	uint32_t best_srcimbl = 0;
+	uint32_t best_dstimbl = 0;
+	struct public_ip_list *ip;
+
+	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+	DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+			   srcnode, lcp2_imbalances[srcnode]));
+
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+		uint32_t srcdsum, srcimbl;
+		unsigned int dst;
+
+		/* Only consider addresses on srcnode. */
+		if (ip->pnn != srcnode) {
+			continue;
+		}
+
+		/* What is this IP address costing the source node? */
+		srcdsum = ip_distance_2_sum(&(ip->addr),
+					    ipalloc_state->all_ips,
+					    srcnode);
+		srcimbl = lcp2_imbalances[srcnode] - srcdsum;
+
+		/* Consider this IP address would cost each potential
+		 * destination node. Destination nodes are limited to
+		 * those that are newly healthy, since we don't want
+		 * to do gratuitous failover of IPs just to make minor
+		 * balance improvements.
+		 */
+		for (dst = 0; dst < numnodes; dst++) {
+			uint32_t dstdsum, dstimbl;
+			bool helps, beats_best;
+
+			if (!rebalance_candidates[dst]) {
+				continue;
+			}
+
+			/* only check nodes that can actually takeover this ip */
+			if (!can_node_takeover_ip(ipalloc_state, dst, ip)) {
+				/* no it couldn't so skip to the next node */
+				continue;
+			}
+
+			dstdsum = ip_distance_2_sum(&(ip->addr),
+						    ipalloc_state->all_ips,
+						    dst);
+			dstimbl = lcp2_imbalances[dst] + dstdsum;
+			DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+					   srcnode, -srcdsum,
+					   ctdb_sock_addr_to_string(
+						   ipalloc_state,
+						   &(ip->addr), false),
+					   dst, dstdsum));
+
+			/* The move must leave the destination less
+			 * imbalanced than the source is now and cost
+			 * the destination less than it costs the
+			 * source...
+			 */
+			helps = (dstimbl < lcp2_imbalances[srcnode]) &&
+				(dstdsum < srcdsum);
+			/* ...and be the first such move or give a
+			 * lower combined imbalance than the best one.
+			 */
+			beats_best = (best_dst == CTDB_UNKNOWN_PNN) ||
+				((srcimbl + dstimbl) <
+				 (best_srcimbl + best_dstimbl));
+
+			if (helps && beats_best) {
+				best_ip = ip;
+				best_srcimbl = srcimbl;
+				best_dst = dst;
+				best_dstimbl = dstimbl;
+			}
+		}
+	}
+	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+	if (best_dst == CTDB_UNKNOWN_PNN) {
+		return false;
+	}
+
+	/* We found a move that makes things better... */
+	DEBUG(DEBUG_INFO,
+	      ("%d [%d] -> %s -> %d [+%d]\n",
+	       srcnode, best_srcimbl - lcp2_imbalances[srcnode],
+	       ctdb_sock_addr_to_string(ipalloc_state,
+					&(best_ip->addr), false),
+	       best_dst, best_dstimbl - lcp2_imbalances[best_dst]));
+
+	lcp2_imbalances[srcnode] = best_srcimbl;
+	lcp2_imbalances[best_dst] = best_dstimbl;
+	best_ip->pnn = best_dst;
+
+	return true;
+}
+
+/* A node number paired with its imbalance, so the pairs can be sorted */
+struct lcp2_imbalance_pnn {
+	uint32_t imbalance;
+	unsigned int pnn;
+};
+
+/* qsort() comparison that orders entries by descending imbalance */
+static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+	const struct lcp2_imbalance_pnn * lhs = a;
+	const struct lcp2_imbalance_pnn * rhs = b;
+
+	if (lhs->imbalance > rhs->imbalance) {
+		return -1;
+	}
+	if (lhs->imbalance < rhs->imbalance) {
+		return 1;
+	}
+	return 0;
+}
+
+/* LCP2 algorithm for rebalancing the cluster.
This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ *
+ * The search is repeated after every successful move.  It stops when
+ * no source node yields an improving move, or when the scratch array
+ * used for sorting can not be allocated.
+ */
+static void lcp2_failback(struct ipalloc_state *ipalloc_state,
+			  uint32_t *lcp2_imbalances,
+			  bool *rebalance_candidates)
+{
+	int i, numnodes;
+	struct lcp2_imbalance_pnn * lips;
+	bool again;
+
+	numnodes = ipalloc_state->num;
+
+	do {
+		/* Put the imbalances and nodes into an array, sort them and
+		 * iterate through candidates. Usually the 1st one will be
+		 * used, so this doesn't cost much...
+		 */
+		DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
+		DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
+		lips = talloc_array(ipalloc_state,
+				    struct lcp2_imbalance_pnn,
+				    numnodes);
+		if (lips == NULL) {
+			/* Without the scratch array there is nothing
+			 * to sort; give up on rebalancing rather than
+			 * dereference NULL.
+			 */
+			DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+			return;
+		}
+		for (i = 0; i < numnodes; i++) {
+			lips[i].imbalance = lcp2_imbalances[i];
+			lips[i].pnn = i;
+			DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
+		}
+		qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
+		      lcp2_cmp_imbalance_pnn);
+
+		again = false;
+		for (i = 0; i < numnodes; i++) {
+			/* This means that all nodes had 0 or 1 addresses, so
+			 * can't be imbalanced.
+			 */
+			if (lips[i].imbalance == 0) {
+				break;
+			}
+
+			if (lcp2_failback_candidate(ipalloc_state,
+						    lips[i].pnn,
+						    lcp2_imbalances,
+						    rebalance_candidates)) {
+				again = true;
+				break;
+			}
+		}
+
+		talloc_free(lips);
+	} while (again);
+}
+
+/*
+ * LCP2 IP allocation entry point.
+ *
+ * Drops unsuitable assignments, allocates unassigned addresses and,
+ * unless failback is disabled or no node is a rebalance candidate,
+ * rebalances addresses between nodes.  Returns false only if the
+ * per-node working arrays can not be set up.
+ */
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
+{
+	uint32_t *lcp2_imbalances = NULL;
+	bool *rebalance_candidates = NULL;
+	int numnodes, i;
+	bool have_rebalance_candidates;
+	bool ret = true;
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	if (!lcp2_init(ipalloc_state,
+		       &lcp2_imbalances, &rebalance_candidates)) {
+		ret = false;
+		goto finished;
+	}
+
+	lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
+
+	/* If we don't want IPs to fail back then don't rebalance IPs. */
+	if (ipalloc_state->no_ip_failback) {
+		goto finished;
+	}
+
+	/* It is only worth continuing if we have suitable target
+	 * nodes to transfer IPs to. This check is much cheaper than
+	 * continuing on...
+	 */
+	numnodes = ipalloc_state->num;
+	have_rebalance_candidates = false;
+	for (i=0; i<numnodes; i++) {
+		if (rebalance_candidates[i]) {
+			have_rebalance_candidates = true;
+			break;
+		}
+	}
+	if (!have_rebalance_candidates) {
+		goto finished;
+	}
+
+	/* Now, try to make sure the ip addresses are evenly distributed
+	   across the nodes.
+	*/
+	lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
+
+finished:
+	/* The working arrays are only needed for this run, so release
+	 * them here instead of leaving them attached to ipalloc_state.
+	 * Either may still be NULL if lcp2_init() failed.
+	 */
+	if (rebalance_candidates != NULL) {
+		talloc_free(rebalance_candidates);
+	}
+	if (lcp2_imbalances != NULL) {
+		talloc_free(lcp2_imbalances);
+	}
+	return ret;
+}
diff --git a/ctdb/server/ipalloc_nondeterministic.c b/ctdb/server/ipalloc_nondeterministic.c
new file mode 100644
index 0000000..9da7d6c
--- /dev/null
+++ b/ctdb/server/ipalloc_nondeterministic.c
@@ -0,0 +1,150 @@
+/*
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg 2007
+   Copyright (C) Andrew Tridgell 2007
+   Copyright (C) Martin Schwenke 2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+#include "common/common.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/* Basic non-deterministic rebalancing algorithm.
+ */
+static void basic_failback(struct ipalloc_state *ipalloc_state,
+			   int num_ips)
+{
+	unsigned int numnodes = ipalloc_state->num;
+	unsigned int node, maxnode, minnode;
+	int maxnum, minnum, coverage;
+	int retries = 0;
+	struct public_ip_list *ip;
+
+try_again:
+	maxnum = 0;
+	minnum = 0;
+
+	/* for each ip address, loop over all nodes that can serve
+	   this ip and make sure that the difference between the node
+	   serving the most and the node serving the least ip's are
+	   not greater than 1.
+	*/
+	for (ip = ipalloc_state->all_ips; ip != NULL; ip = ip->next) {
+		struct public_ip_list *victim;
+
+		if (ip->pnn == CTDB_UNKNOWN_PNN) {
+			continue;
+		}
+
+		/* Get the highest and lowest number of ips's served by any
+		   valid node which can serve this ip.
+		*/
+		maxnode = CTDB_UNKNOWN_PNN;
+		minnode = CTDB_UNKNOWN_PNN;
+		for (node = 0; node < numnodes; node++) {
+			/* only check nodes that can actually serve this ip */
+			if (!can_node_takeover_ip(ipalloc_state, node, ip)) {
+				/* no it couldn't so skip to the next node */
+				continue;
+			}
+
+			coverage = node_ip_coverage(node,
+						    ipalloc_state->all_ips);
+			if (maxnode == CTDB_UNKNOWN_PNN || coverage > maxnum) {
+				maxnode = node;
+				maxnum = coverage;
+			}
+			if (minnode == CTDB_UNKNOWN_PNN || coverage < minnum) {
+				minnode = node;
+				minnum = coverage;
+			}
+		}
+		if (maxnode == CTDB_UNKNOWN_PNN) {
+			DEBUG(DEBUG_WARNING,
+			      (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
+			       ctdb_sock_addr_to_string(ipalloc_state,
+							&ip->addr, false)));
+
+			continue;
+		}
+
+		/* if the spread between the smallest and largest coverage by
+		   a node is >=2 we steal one of the ips from the node with
+		   most coverage to even things out a bit.
+		   try to do this a limited number of times since we dont
+		   want to spend too much time balancing the ip coverage.
+		*/
+		if (maxnum <= minnum + 1) {
+			continue;
+		}
+		if (retries >= (num_ips + 5)) {
+			continue;
+		}
+
+		/* Reassign one of maxnode's VNNs */
+		for (victim = ipalloc_state->all_ips;
+		     victim != NULL;
+		     victim = victim->next) {
+			if (victim->pnn != maxnode) {
+				continue;
+			}
+
+			(void)find_takeover_node(ipalloc_state, victim);
+			retries++;
+			/* Coverage changed, so start the scan again */
+			goto try_again;
+		}
+	}
+}
+
+/*
+ * Non-deterministic IP allocation: drop unsuitable assignments,
+ * allocate whatever is unassigned and, unless failback is disabled,
+ * even out the number of addresses per node.  Always returns true.
+ */
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
+{
+	/* This should be pushed down into basic_failback. */
+	struct public_ip_list *ip = ipalloc_state->all_ips;
+	int num_ips = 0;
+
+	while (ip != NULL) {
+		num_ips++;
+		ip = ip->next;
+	}
+
+	unassign_unsuitable_ips(ipalloc_state);
+
+	basic_allocate_unassigned(ipalloc_state);
+
+	/* Only rebalance when IPs are allowed to fail back; the
+	 * rebalancing tries to make sure the ip addresses are evenly
+	 * distributed across the nodes.
+	 */
+	if (!ipalloc_state->no_ip_failback) {
+		basic_failback(ipalloc_state, num_ips);
+	}
+
+	return true;
+}
diff --git a/ctdb/server/ipalloc_private.h b/ctdb/server/ipalloc_private.h
new file mode 100644
index 0000000..3ea3d31
--- /dev/null
+++ b/ctdb/server/ipalloc_private.h
@@ -0,0 +1,57 @@
+/*
+   CTDB IP takeover code
+
+   Copyright (C) Ronnie Sahlberg 2007
+   Copyright (C) Andrew Tridgell 2007
+   Copyright (C) Martin Schwenke 2015
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_PRIVATE_H__
+#define __CTDB_IPALLOC_PRIVATE_H__
+
+#include "protocol/protocol.h"
+
+#include "server/ipalloc.h"
+/*
+ * State shared by the IP allocation algorithms (non-deterministic,
+ * deterministic and LCP2).
+ */
+struct ipalloc_state {
+	uint32_t num;	/* number of nodes; the allocators loop over
+			 * node numbers 0 .. num-1 */
+
+	/* Arrays with data for each node */
+	struct ctdb_public_ip_list *available_public_ips;
+	struct ctdb_public_ip_list *known_public_ips;
+
+	struct public_ip_list *all_ips;	/* list of addresses, linked
+					 * through ->next; ->pnn is the
+					 * node currently assigned */
+	enum ipalloc_algorithm algorithm;
+	bool no_ip_failback;	/* when set the allocators skip their
+				 * rebalancing step */
+	bool no_ip_takeover;
+	uint32_t *force_rebalance_nodes;	/* talloc array of node
+						 * numbers that LCP2 always
+						 * treats as rebalance
+						 * targets; may be NULL */
+};
+/*
+ * Helpers shared by the algorithms.  find_takeover_node() returns 0
+ * after setting ip->pnn, or -1 if no node could be found.
+ */
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+			  int32_t pnn,
+			  struct public_ip_list *ip);
+int node_ip_coverage(uint32_t pnn, struct public_ip_list *ips);
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+		       struct public_ip_list *ip);
+/*
+ * unassign_unsuitable_ips() resets ->pnn to CTDB_UNKNOWN_PNN for
+ * addresses whose node can not host them; basic_allocate_unassigned()
+ * tries to find a node for every address that has none.
+ */
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state);
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state);
+/*
+ * The allocation algorithms.  Each one updates ->pnn of the entries
+ * in all_ips.
+ */
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_PRIVATE_H__ */
diff --git a/ctdb/server/legacy_conf.c b/ctdb/server/legacy_conf.c
new file mode 100644
index 0000000..3391a3b
--- /dev/null
+++ b/ctdb/server/legacy_conf.c
@@ -0,0 +1,80 @@
+/*
+   CTDB legacy config handling
+
+   Copyright (C) Martin Schwenke 2018
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "lib/util/debug.h"
+
+#include "common/conf.h"
+#include "common/logging.h"
+
+#include "legacy_conf.h"
+
+#define LEGACY_SCRIPT_LOG_LEVEL_DEFAULT "ERROR"
+
+/*
+ * Validation callback for the script log level option: accept the
+ * new value only if debug_level_parse() understands it, otherwise
+ * log an error and reject it.  The old value and the update mode are
+ * not looked at.
+ */
+static bool legacy_conf_validate_script_log_level(const char *key,
+						  const char *old_loglevel,
+						  const char *new_loglevel,
+						  enum conf_update_mode mode)
+{
+	int log_level;
+
+	if (debug_level_parse(new_loglevel, &log_level)) {
+		return true;
+	}
+
+	D_ERR("Invalid value for [%s] -> %s = %s\n",
+	      LEGACY_CONF_SECTION,
+	      key,
+	      new_loglevel);
+	return false;
+}
+
+/*
+ * Define the legacy section and its options, with their defaults, on
+ * the given configuration context.
+ */
+void legacy_conf_init(struct conf_context *conf)
+{
+	conf_define_section(conf, LEGACY_CONF_SECTION, NULL);
+
+	/* Boolean options, none of them with a validation callback */
+	conf_define_boolean(conf, LEGACY_CONF_SECTION,
+			    LEGACY_CONF_REALTIME_SCHEDULING, true, NULL);
+	conf_define_boolean(conf, LEGACY_CONF_SECTION,
+			    LEGACY_CONF_LMASTER_CAPABILITY, true, NULL);
+	conf_define_boolean(conf, LEGACY_CONF_SECTION,
+			    LEGACY_CONF_START_AS_STOPPED, false, NULL);
+	conf_define_boolean(conf, LEGACY_CONF_SECTION,
+			    LEGACY_CONF_START_AS_DISABLED, false, NULL);
+
+	/* The log level is a string that has to parse as a debug level */
+	conf_define_string(conf, LEGACY_CONF_SECTION,
+			   LEGACY_CONF_SCRIPT_LOG_LEVEL,
+			   LEGACY_SCRIPT_LOG_LEVEL_DEFAULT,
+			   legacy_conf_validate_script_log_level);
+}
diff --git a/ctdb/server/legacy_conf.h b/ctdb/server/legacy_conf.h
new file mode 100644
index 0000000..b6b4b57
--- /dev/null
+++ b/ctdb/server/legacy_conf.h
@@ -0,0 +1,35 @@
+/*
+   CTDB legacy config handling
+
+   Copyright (C) Martin Schwenke 2018
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_LEGACY_CONF_H__
+#define __CTDB_LEGACY_CONF_H__
+
+#include "common/conf.h"
+/* Name of the configuration section that holds the options below */
+#define LEGACY_CONF_SECTION "legacy"
+/*
+ * Option names within that section.  legacy_conf_init() defines the
+ * first four as booleans (defaults: true, true, false, false) and
+ * the script log level as a string defaulting to "ERROR".
+ */
+#define LEGACY_CONF_REALTIME_SCHEDULING "realtime scheduling"
+#define LEGACY_CONF_LMASTER_CAPABILITY "lmaster capability"
+#define LEGACY_CONF_START_AS_STOPPED "start as stopped"
+#define LEGACY_CONF_START_AS_DISABLED "start as disabled"
+#define LEGACY_CONF_SCRIPT_LOG_LEVEL "script log level"
+/* Define the legacy section and its options on the given context */
+void legacy_conf_init(struct conf_context *conf);
+
+#endif /* __CTDB_LEGACY_CONF_H__ */ |