summaryrefslogtreecommitdiffstats
path: root/ctdb/server/ipalloc_lcp2.c
diff options
context:
space:
mode:
Diffstat (limited to 'ctdb/server/ipalloc_lcp2.c')
-rw-r--r--ctdb/server/ipalloc_lcp2.c525
1 files changed, 525 insertions, 0 deletions
diff --git a/ctdb/server/ipalloc_lcp2.c b/ctdb/server/ipalloc_lcp2.c
new file mode 100644
index 0000000..bc2936b
--- /dev/null
+++ b/ctdb/server/ipalloc_lcp2.c
@@ -0,0 +1,525 @@
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_util.h"
+
+#include "server/ipalloc_private.h"
+
+/*
+ * This is the length of the longtest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes. The implementation means that all
+ * addresses end up being 128 bits long.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
+ */
+static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
+{
+ uint32_t ip1_k[IP_KEYLEN];
+ uint32_t *t;
+ int i;
+ uint32_t x;
+
+ uint32_t distance = 0;
+
+ memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+ t = ip_key(ip2);
+ for (i=0; i<IP_KEYLEN; i++) {
+ x = ip1_k[i] ^ t[i];
+ if (x == 0) {
+ distance += 32;
+ } else {
+ /* Count number of leading zeroes.
+ * FIXME? This could be optimised...
+ */
+ while ((x & ((uint32_t)1 << 31)) == 0) {
+ x <<= 1;
+ distance += 1;
+ }
+ }
+ }
+
+ return distance;
+}
+
+/* Calculate the IP distance for the given IP relative to IPs on the
+ given node. The ips argument is generally the all_ips variable
+ used in the main part of the algorithm.
+ */
+static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+ struct public_ip_list *ips,
+ unsigned int pnn)
+{
+ struct public_ip_list *t;
+ uint32_t d;
+
+ uint32_t sum = 0;
+
+ for (t = ips; t != NULL; t = t->next) {
+ if (t->pnn != pnn) {
+ continue;
+ }
+
+ /* Optimisation: We never calculate the distance
+ * between an address and itself. This allows us to
+ * calculate the effect of removing an address from a
+ * node by simply calculating the distance between
+ * that address and all of the exitsing addresses.
+ * Moreover, we assume that we're only ever dealing
+ * with addresses from all_ips so we can identify an
+ * address via a pointer rather than doing a more
+ * expensive address comparison. */
+ if (&(t->addr) == ip) {
+ continue;
+ }
+
+ d = ip_distance(ip, &(t->addr));
+ sum += d * d; /* Cheaper than pulling in math.h :-) */
+ }
+
+ return sum;
+}
+
+/* Return the LCP2 imbalance metric for addresses currently assigned
+ to the given node.
+ */
+static uint32_t lcp2_imbalance(struct public_ip_list * all_ips,
+ unsigned int pnn)
+{
+ struct public_ip_list *t;
+
+ uint32_t imbalance = 0;
+
+ for (t = all_ips; t != NULL; t = t->next) {
+ if (t->pnn != pnn) {
+ continue;
+ }
+ /* Pass the rest of the IPs rather than the whole
+ all_ips input list.
+ */
+ imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
+ }
+
+ return imbalance;
+}
+
+static bool lcp2_init(struct ipalloc_state *ipalloc_state,
+ uint32_t **lcp2_imbalances,
+ bool **rebalance_candidates)
+{
+ unsigned int i, numnodes;
+ struct public_ip_list *t;
+
+ numnodes = ipalloc_state->num;
+
+ *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
+ if (*rebalance_candidates == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ return false;
+ }
+ *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
+ if (*lcp2_imbalances == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ return false;
+ }
+
+ for (i=0; i<numnodes; i++) {
+ (*lcp2_imbalances)[i] =
+ lcp2_imbalance(ipalloc_state->all_ips, i);
+ /* First step: assume all nodes are candidates */
+ (*rebalance_candidates)[i] = true;
+ }
+
+ /* 2nd step: if a node has IPs assigned then it must have been
+ * healthy before, so we remove it from consideration. This
+ * is overkill but is all we have because we don't maintain
+ * state between takeover runs. An alternative would be to
+ * keep state and invalidate it every time the recovery master
+ * changes.
+ */
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn != CTDB_UNKNOWN_PNN) {
+ (*rebalance_candidates)[t->pnn] = false;
+ }
+ }
+
+ /* 3rd step: if a node is forced to re-balance then
+ we allow failback onto the node */
+ if (ipalloc_state->force_rebalance_nodes == NULL) {
+ return true;
+ }
+ for (i = 0;
+ i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
+ i++) {
+ uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
+ if (pnn >= numnodes) {
+ DEBUG(DEBUG_ERR,
+ (__location__ "unknown node %u\n", pnn));
+ continue;
+ }
+
+ DEBUG(DEBUG_NOTICE,
+ ("Forcing rebalancing of IPs to node %u\n", pnn));
+ (*rebalance_candidates)[pnn] = true;
+ }
+
+ return true;
+}
+
+/* Allocate any unassigned addresses using the LCP2 algorithm to find
+ * the IP/node combination that will cost the least.
+ */
+static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
+ uint32_t *lcp2_imbalances)
+{
+ struct public_ip_list *t;
+ unsigned int dstnode, numnodes;
+
+ unsigned int minnode;
+ uint32_t mindsum, dstdsum, dstimbl;
+ uint32_t minimbl = 0;
+ struct public_ip_list *minip;
+
+ bool should_loop = true;
+ bool have_unassigned = true;
+
+ numnodes = ipalloc_state->num;
+
+ while (have_unassigned && should_loop) {
+ should_loop = false;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+ minnode = CTDB_UNKNOWN_PNN;
+ mindsum = 0;
+ minip = NULL;
+
+ /* loop over each unassigned ip. */
+ for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
+ if (t->pnn != CTDB_UNKNOWN_PNN) {
+ continue;
+ }
+
+ for (dstnode = 0; dstnode < numnodes; dstnode++) {
+ /* only check nodes that can actually takeover this ip */
+ if (!can_node_takeover_ip(ipalloc_state,
+ dstnode,
+ t)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(t->addr),
+ ipalloc_state->all_ips,
+ dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,
+ (" %s -> %d [+%d]\n",
+ ctdb_sock_addr_to_string(ipalloc_state,
+ &(t->addr),
+ false),
+ dstnode,
+ dstimbl - lcp2_imbalances[dstnode]));
+
+
+ if (minnode == CTDB_UNKNOWN_PNN ||
+ dstdsum < mindsum) {
+ minnode = dstnode;
+ minimbl = dstimbl;
+ mindsum = dstdsum;
+ minip = t;
+ should_loop = true;
+ }
+ }
+ }
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ /* If we found one then assign it to the given node. */
+ if (minnode != CTDB_UNKNOWN_PNN) {
+ minip->pnn = minnode;
+ lcp2_imbalances[minnode] = minimbl;
+ DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+ ctdb_sock_addr_to_string(
+ ipalloc_state,
+ &(minip->addr), false),
+ minnode,
+ mindsum));
+ }
+
+ /* There might be a better way but at least this is clear. */
+ have_unassigned = false;
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == CTDB_UNKNOWN_PNN) {
+ have_unassigned = true;
+ }
+ }
+ }
+
+ /* We know if we have an unassigned addresses so we might as
+ * well optimise.
+ */
+ if (have_unassigned) {
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == CTDB_UNKNOWN_PNN) {
+ DEBUG(DEBUG_WARNING,
+ ("Failed to find node to cover ip %s\n",
+ ctdb_sock_addr_to_string(ipalloc_state,
+ &t->addr,
+ false)));
+ }
+ }
+ }
+}
+
+/* LCP2 algorithm for rebalancing the cluster. Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.
+ */
+static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
+ unsigned int srcnode,
+ uint32_t *lcp2_imbalances,
+ bool *rebalance_candidates)
+{
+ unsigned int dstnode, mindstnode, numnodes;
+ uint32_t srcdsum, dstimbl, dstdsum;
+ uint32_t minsrcimbl, mindstimbl;
+ struct public_ip_list *minip;
+ struct public_ip_list *t;
+
+ /* Find an IP and destination node that best reduces imbalance. */
+ minip = NULL;
+ minsrcimbl = 0;
+ mindstnode = CTDB_UNKNOWN_PNN;
+ mindstimbl = 0;
+
+ numnodes = ipalloc_state->num;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+ srcnode, lcp2_imbalances[srcnode]));
+
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ uint32_t srcimbl;
+
+ /* Only consider addresses on srcnode. */
+ if (t->pnn != srcnode) {
+ continue;
+ }
+
+ /* What is this IP address costing the source node? */
+ srcdsum = ip_distance_2_sum(&(t->addr),
+ ipalloc_state->all_ips,
+ srcnode);
+ srcimbl = lcp2_imbalances[srcnode] - srcdsum;
+
+ /* Consider this IP address would cost each potential
+ * destination node. Destination nodes are limited to
+ * those that are newly healthy, since we don't want
+ * to do gratuitous failover of IPs just to make minor
+ * balance improvements.
+ */
+ for (dstnode = 0; dstnode < numnodes; dstnode++) {
+ if (!rebalance_candidates[dstnode]) {
+ continue;
+ }
+
+ /* only check nodes that can actually takeover this ip */
+ if (!can_node_takeover_ip(ipalloc_state, dstnode,
+ t)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(t->addr),
+ ipalloc_state->all_ips,
+ dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+ srcnode, -srcdsum,
+ ctdb_sock_addr_to_string(
+ ipalloc_state,
+ &(t->addr), false),
+ dstnode, dstdsum));
+
+ if ((dstimbl < lcp2_imbalances[srcnode]) &&
+ (dstdsum < srcdsum) && \
+ ((mindstnode == CTDB_UNKNOWN_PNN) || \
+ ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+ minip = t;
+ minsrcimbl = srcimbl;
+ mindstnode = dstnode;
+ mindstimbl = dstimbl;
+ }
+ }
+ }
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ if (mindstnode != CTDB_UNKNOWN_PNN) {
+ /* We found a move that makes things better... */
+ DEBUG(DEBUG_INFO,
+ ("%d [%d] -> %s -> %d [+%d]\n",
+ srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+ ctdb_sock_addr_to_string(ipalloc_state,
+ &(minip->addr), false),
+ mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+
+ lcp2_imbalances[srcnode] = minsrcimbl;
+ lcp2_imbalances[mindstnode] = mindstimbl;
+ minip->pnn = mindstnode;
+
+ return true;
+ }
+
+ return false;
+}
+
+struct lcp2_imbalance_pnn {
+ uint32_t imbalance;
+ unsigned int pnn;
+};
+
+static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+ const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
+ const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
+
+ if (lipa->imbalance > lipb->imbalance) {
+ return -1;
+ } else if (lipa->imbalance == lipb->imbalance) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+/* LCP2 algorithm for rebalancing the cluster. This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ */
+static void lcp2_failback(struct ipalloc_state *ipalloc_state,
+ uint32_t *lcp2_imbalances,
+ bool *rebalance_candidates)
+{
+ int i, numnodes;
+ struct lcp2_imbalance_pnn * lips;
+ bool again;
+
+ numnodes = ipalloc_state->num;
+
+try_again:
+ /* Put the imbalances and nodes into an array, sort them and
+ * iterate through candidates. Usually the 1st one will be
+ * used, so this doesn't cost much...
+ */
+ DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
+ DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
+ lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
+ for (i = 0; i < numnodes; i++) {
+ lips[i].imbalance = lcp2_imbalances[i];
+ lips[i].pnn = i;
+ DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
+ }
+ qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
+ lcp2_cmp_imbalance_pnn);
+
+ again = false;
+ for (i = 0; i < numnodes; i++) {
+ /* This means that all nodes had 0 or 1 addresses, so
+ * can't be imbalanced.
+ */
+ if (lips[i].imbalance == 0) {
+ break;
+ }
+
+ if (lcp2_failback_candidate(ipalloc_state,
+ lips[i].pnn,
+ lcp2_imbalances,
+ rebalance_candidates)) {
+ again = true;
+ break;
+ }
+ }
+
+ talloc_free(lips);
+ if (again) {
+ goto try_again;
+ }
+}
+
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
+{
+ uint32_t *lcp2_imbalances;
+ bool *rebalance_candidates;
+ int numnodes, i;
+ bool have_rebalance_candidates;
+ bool ret = true;
+
+ unassign_unsuitable_ips(ipalloc_state);
+
+ if (!lcp2_init(ipalloc_state,
+ &lcp2_imbalances, &rebalance_candidates)) {
+ ret = false;
+ goto finished;
+ }
+
+ lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
+
+ /* If we don't want IPs to fail back then don't rebalance IPs. */
+ if (ipalloc_state->no_ip_failback) {
+ goto finished;
+ }
+
+ /* It is only worth continuing if we have suitable target
+ * nodes to transfer IPs to. This check is much cheaper than
+ * continuing on...
+ */
+ numnodes = ipalloc_state->num;
+ have_rebalance_candidates = false;
+ for (i=0; i<numnodes; i++) {
+ if (rebalance_candidates[i]) {
+ have_rebalance_candidates = true;
+ break;
+ }
+ }
+ if (!have_rebalance_candidates) {
+ goto finished;
+ }
+
+ /* Now, try to make sure the ip adresses are evenly distributed
+ across the nodes.
+ */
+ lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
+
+finished:
+ return ret;
+}