diff options
Diffstat (limited to 'daemons/controld/controld_remote_ra.c')
-rw-r--r-- | daemons/controld/controld_remote_ra.c | 1440 |
1 files changed, 1440 insertions, 0 deletions
diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c new file mode 100644 index 0000000..f24b755 --- /dev/null +++ b/daemons/controld/controld_remote_ra.c @@ -0,0 +1,1440 @@ +/* + * Copyright 2013-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <crm/crm.h> +#include <crm/msg_xml.h> +#include <crm/common/xml_internal.h> +#include <crm/lrmd.h> +#include <crm/lrmd_internal.h> +#include <crm/services.h> + +#include <pacemaker-controld.h> + +#define REMOTE_LRMD_RA "remote" + +/* The max start timeout before cmd retry */ +#define MAX_START_TIMEOUT_MS 10000 + +#define cmd_set_flags(cmd, flags_to_set) do { \ + (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \ + "Remote command", (cmd)->rsc_id, (cmd)->status, \ + (flags_to_set), #flags_to_set); \ + } while (0) + +#define cmd_clear_flags(cmd, flags_to_clear) do { \ + (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \ + "Remote command", (cmd)->rsc_id, (cmd)->status, \ + (flags_to_clear), #flags_to_clear); \ + } while (0) + +enum remote_cmd_status { + cmd_reported_success = (1 << 0), + cmd_cancel = (1 << 1), +}; + +typedef struct remote_ra_cmd_s { + /*! the local node the cmd is issued from */ + char *owner; + /*! the remote node the cmd is executed on */ + char *rsc_id; + /*! the action to execute */ + char *action; + /*! some string the client wants us to give it back */ + char *userdata; + /*! start delay in ms */ + int start_delay; + /*! timer id used for start delay. */ + int delay_id; + /*! timeout in ms for cmd */ + int timeout; + int remaining_timeout; + /*! recurring interval in ms */ + guint interval_ms; + /*! 
interval timer id */ + int interval_id; + int monitor_timeout_id; + int takeover_timeout_id; + /*! action parameters */ + lrmd_key_value_t *params; + pcmk__action_result_t result; + int call_id; + time_t start_time; + uint32_t status; +} remote_ra_cmd_t; + +#define lrm_remote_set_flags(lrm_state, flags_to_set) do { \ + lrm_state_t *lrm = (lrm_state); \ + remote_ra_data_t *ra = lrm->remote_ra_data; \ + ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \ + lrm->node_name, ra->status, \ + (flags_to_set), #flags_to_set); \ + } while (0) + +#define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \ + lrm_state_t *lrm = (lrm_state); \ + remote_ra_data_t *ra = lrm->remote_ra_data; \ + ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \ + lrm->node_name, ra->status, \ + (flags_to_clear), #flags_to_clear); \ + } while (0) + +enum remote_status { + expect_takeover = (1 << 0), + takeover_complete = (1 << 1), + remote_active = (1 << 2), + /* Maintenance mode is difficult to determine from the controller's context, + * so we have it signalled back with the transition from the scheduler. + */ + remote_in_maint = (1 << 3), + /* Similar for whether we are controlling a guest node or remote node. + * Fortunately there is a meta-attribute in the transition already and + * as the situation doesn't change over time we can use the + * resource start for noting down the information for later use when + * the attributes aren't at hand. 
+ */ + controlling_guest = (1 << 4), +}; + +typedef struct remote_ra_data_s { + crm_trigger_t *work; + remote_ra_cmd_t *cur_cmd; + GList *cmds; + GList *recurring_cmds; + uint32_t status; +} remote_ra_data_t; + +static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms); +static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd); +static GList *fail_all_monitor_cmds(GList * list); + +static void +free_cmd(gpointer user_data) +{ + remote_ra_cmd_t *cmd = user_data; + + if (!cmd) { + return; + } + if (cmd->delay_id) { + g_source_remove(cmd->delay_id); + } + if (cmd->interval_id) { + g_source_remove(cmd->interval_id); + } + if (cmd->monitor_timeout_id) { + g_source_remove(cmd->monitor_timeout_id); + } + if (cmd->takeover_timeout_id) { + g_source_remove(cmd->takeover_timeout_id); + } + free(cmd->owner); + free(cmd->rsc_id); + free(cmd->action); + free(cmd->userdata); + pcmk__reset_result(&(cmd->result)); + lrmd_key_value_freeall(cmd->params); + free(cmd); +} + +static int +generate_callid(void) +{ + static int remote_ra_callid = 0; + + remote_ra_callid++; + if (remote_ra_callid <= 0) { + remote_ra_callid = 1; + } + + return remote_ra_callid; +} + +static gboolean +recurring_helper(gpointer data) +{ + remote_ra_cmd_t *cmd = data; + lrm_state_t *connection_rsc = NULL; + + cmd->interval_id = 0; + connection_rsc = lrm_state_find(cmd->rsc_id); + if (connection_rsc && connection_rsc->remote_ra_data) { + remote_ra_data_t *ra_data = connection_rsc->remote_ra_data; + + ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd); + + ra_data->cmds = g_list_append(ra_data->cmds, cmd); + mainloop_set_trigger(ra_data->work); + } + return FALSE; +} + +static gboolean +start_delay_helper(gpointer data) +{ + remote_ra_cmd_t *cmd = data; + lrm_state_t *connection_rsc = NULL; + + cmd->delay_id = 0; + connection_rsc = lrm_state_find(cmd->rsc_id); + if (connection_rsc && connection_rsc->remote_ra_data) { + 
remote_ra_data_t *ra_data = connection_rsc->remote_ra_data; + + mainloop_set_trigger(ra_data->work); + } + return FALSE; +} + +static bool +should_purge_attributes(crm_node_t *node) +{ + bool purge = true; + crm_node_t *conn_node = NULL; + lrm_state_t *connection_rsc = NULL; + + if (!node->conn_host) { + return purge; + } + + /* Get the node that was hosting the remote connection resource from the + * peer cache. That's the one we really care about here. + */ + conn_node = crm_get_peer(0, node->conn_host); + if (conn_node == NULL) { + return purge; + } + + /* Check the uptime of connection_rsc. If it hasn't been running long + * enough, set purge=true. "Long enough" means it started running earlier + * than the timestamp when we noticed it went away in the first place. + */ + connection_rsc = lrm_state_find(node->uname); + + if (connection_rsc != NULL) { + lrmd_t *lrm = connection_rsc->conn; + time_t uptime = lrmd__uptime(lrm); + time_t now = time(NULL); + + /* Add 20s of fuzziness to give corosync a while to notice the remote + * host is gone. On various error conditions (failure to get uptime, + * peer_lost isn't set) we default to purging. 
+ */ + if (uptime > 0 && + conn_node->peer_lost > 0 && + uptime + 20 >= now - conn_node->peer_lost) { + purge = false; + } + } + + return purge; +} + +static enum controld_section_e +section_to_delete(bool purge) +{ + if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { + if (purge) { + return controld_section_all_unlocked; + } else { + return controld_section_lrm_unlocked; + } + } else { + if (purge) { + return controld_section_all; + } else { + return controld_section_lrm; + } + } +} + +static void +purge_remote_node_attrs(int call_opt, crm_node_t *node) +{ + bool purge = should_purge_attributes(node); + enum controld_section_e section = section_to_delete(purge); + + /* Purge node from attrd's memory */ + if (purge) { + update_attrd_remote_node_removed(node->uname, NULL); + } + + controld_delete_node_state(node->uname, section, call_opt); +} + +/*! + * \internal + * \brief Handle cluster communication related to pacemaker_remote node joining + * + * \param[in] node_name Name of newly integrated pacemaker_remote node + */ +static void +remote_node_up(const char *node_name) +{ + int call_opt; + xmlNode *update, *state; + crm_node_t *node; + + CRM_CHECK(node_name != NULL, return); + crm_info("Announcing Pacemaker Remote node %s", node_name); + + call_opt = crmd_cib_smart_opt(); + + /* Delete node's probe_complete attribute. 
This serves two purposes: + * + * - @COMPAT DCs < 1.1.14 in a rolling upgrade might use it + * - deleting it (or any attribute for that matter) here ensures the + * attribute manager learns the node is remote + */ + update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE); + + /* Ensure node is in the remote peer cache with member status */ + node = crm_remote_peer_get(node_name); + CRM_CHECK(node != NULL, return); + + purge_remote_node_attrs(call_opt, node); + pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0); + + /* pacemaker_remote nodes don't participate in the membership layer, + * so cluster nodes don't automatically get notified when they come and go. + * We send a cluster message to the DC, and update the CIB node state entry, + * so the DC will get it sooner (via message) or later (via CIB refresh), + * and any other interested parties can query the CIB. + */ + broadcast_remote_state_message(node_name, true); + + update = create_xml_node(NULL, XML_CIB_TAG_STATUS); + state = create_node_state_update(node, node_update_cluster, update, + __func__); + + /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever + * needs to be fenced, this flag will allow various actions to determine + * whether the fencing has happened yet. + */ + crm_xml_add(state, XML_NODE_IS_FENCED, "0"); + + /* TODO: If the remote connection drops, and this (async) CIB update either + * failed or has not yet completed, later actions could mistakenly think the + * node has already been fenced (if the XML_NODE_IS_FENCED attribute was + * previously set, because it won't have been cleared). This could prevent + * actual fencing or allow recurring monitor failures to be cleared too + * soon. Ideally, we wouldn't rely on the CIB for the fenced status. + */ + controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt, NULL); + free_xml(update); +} + +enum down_opts { + DOWN_KEEP_LRM, + DOWN_ERASE_LRM +}; + +/*! 
+ * \internal + * \brief Handle cluster communication related to pacemaker_remote node leaving + * + * \param[in] node_name Name of lost node + * \param[in] opts Whether to keep or erase LRM history + */ +static void +remote_node_down(const char *node_name, const enum down_opts opts) +{ + xmlNode *update; + int call_opt = crmd_cib_smart_opt(); + crm_node_t *node; + + /* Purge node from attrd's memory */ + update_attrd_remote_node_removed(node_name, NULL); + + /* Normally, only node attributes should be erased, and the resource history + * should be kept until the node comes back up. However, after a successful + * fence, we want to clear the history as well, so we don't think resources + * are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { + controld_delete_node_state(node_name, controld_section_all, call_opt); + } else { + controld_delete_node_state(node_name, controld_section_attrs, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ + node = crm_remote_peer_get(node_name); + CRM_CHECK(node != NULL, return); + pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0); + + /* Notify DC */ + broadcast_remote_state_message(node_name, false); + + /* Update CIB node state */ + update = create_xml_node(NULL, XML_CIB_TAG_STATUS); + create_node_state_update(node, node_update_cluster, update, __func__); + controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt, NULL); + free_xml(update); +} + +/*! 
+ * \internal + * \brief Handle effects of a remote RA command on node state + * + * \param[in] cmd Completed remote RA command + */ +static void +check_remote_node_state(const remote_ra_cmd_t *cmd) +{ + /* Only successful actions can change node state */ + if (!pcmk__result_ok(&(cmd->result))) { + return; + } + + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + remote_node_up(cmd->rsc_id); + + } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) { + /* After a successful migration, we don't need to do remote_node_up() + * because the DC already knows the node is up, and we don't want to + * clear LRM history etc. We do need to add the remote node to this + * host's remote peer cache, because (unless it happens to be DC) + * it hasn't been tracking the remote node, and other code relies on + * the cache to distinguish remote nodes from unseen cluster nodes. + */ + crm_node_t *node = crm_remote_peer_get(cmd->rsc_id); + + CRM_CHECK(node != NULL, return); + pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0); + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id); + remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL; + + if (ra_data) { + if (!pcmk_is_set(ra_data->status, takeover_complete)) { + /* Stop means down if we didn't successfully migrate elsewhere */ + remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM); + } else if (AM_I_DC == FALSE) { + /* Only the connection host and DC track node state, + * so if the connection migrated elsewhere and we aren't DC, + * un-cache the node, so we don't have stale info + */ + crm_remote_peer_cache_remove(cmd->rsc_id); + } + } + } + + /* We don't do anything for successful monitors, which is correct for + * routine recurring monitors, and for monitors on nodes where the + * connection isn't supposed to be (the cluster will stop the connection in + * that case). 
However, if the initial probe finds the connection already + * active on the node where we want it, we probably should do + * remote_node_up(). Unfortunately, we can't distinguish that case here. + * Given that connections have to be initiated by the cluster, the chance of + * that should be close to zero. + */ +} + +static void +report_remote_ra_result(remote_ra_cmd_t * cmd) +{ + lrmd_event_data_t op = { 0, }; + + check_remote_node_state(cmd); + + op.type = lrmd_event_exec_complete; + op.rsc_id = cmd->rsc_id; + op.op_type = cmd->action; + op.user_data = cmd->userdata; + op.timeout = cmd->timeout; + op.interval_ms = cmd->interval_ms; + op.t_run = (unsigned int) cmd->start_time; + op.t_rcchange = (unsigned int) cmd->start_time; + + lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status, + cmd->result.exit_reason); + + if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) { + op.t_rcchange = (unsigned int) time(NULL); + /* This edge case will likely never ever occur, but if it does the + * result is that a failure will not be processed correctly. This is only + * remotely possible because we are able to detect a connection resource's tcp + * connection has failed at any moment after start has completed. The actual + * recurring operation is just a connectivity ping. + * + * basically, we are not guaranteed that the first successful monitor op and + * a subsequent failed monitor op will not occur in the same timestamp. We have to + * make it look like the operations occurred at separate times though. 
*/ + if (op.t_rcchange == op.t_run) { + op.t_rcchange++; + } + } + + if (cmd->params) { + lrmd_key_value_t *tmp; + + op.params = pcmk__strkey_table(free, free); + for (tmp = cmd->params; tmp; tmp = tmp->next) { + g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value)); + } + + } + op.call_id = cmd->call_id; + op.remote_nodename = cmd->owner; + + lrm_op_callback(&op); + + if (op.params) { + g_hash_table_destroy(op.params); + } + lrmd__reset_result(&op); +} + +static void +update_remaining_timeout(remote_ra_cmd_t * cmd) +{ + cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000; +} + +static gboolean +retry_start_cmd_cb(gpointer data) +{ + lrm_state_t *lrm_state = data; + remote_ra_data_t *ra_data = lrm_state->remote_ra_data; + remote_ra_cmd_t *cmd = NULL; + int rc = ETIME; + + if (!ra_data || !ra_data->cur_cmd) { + return FALSE; + } + cmd = ra_data->cur_cmd; + if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) { + return FALSE; + } + update_remaining_timeout(cmd); + + if (cmd->remaining_timeout > 0) { + rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout); + } else { + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_TIMEOUT, + "Not enough time remains to retry remote connection"); + } + + if (rc != pcmk_rc_ok) { + report_remote_ra_result(cmd); + + if (ra_data->cmds) { + mainloop_set_trigger(ra_data->work); + } + ra_data->cur_cmd = NULL; + free_cmd(cmd); + } else { + /* wait for connection event */ + } + + return FALSE; +} + + +static gboolean +connection_takeover_timeout_cb(gpointer data) +{ + lrm_state_t *lrm_state = NULL; + remote_ra_cmd_t *cmd = data; + + crm_info("takeover event timed out for node %s", cmd->rsc_id); + cmd->takeover_timeout_id = 0; + + lrm_state = lrm_state_find(cmd->rsc_id); + + handle_remote_ra_stop(lrm_state, cmd); + free_cmd(cmd); + + return FALSE; +} + +static gboolean +monitor_timeout_cb(gpointer data) +{ + lrm_state_t *lrm_state = NULL; 
+ remote_ra_cmd_t *cmd = data; + + lrm_state = lrm_state_find(cmd->rsc_id); + + crm_info("Timed out waiting for remote poke response from %s%s", + cmd->rsc_id, (lrm_state? "" : " (no LRM state)")); + cmd->monitor_timeout_id = 0; + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, + "Remote executor did not respond"); + + if (lrm_state && lrm_state->remote_ra_data) { + remote_ra_data_t *ra_data = lrm_state->remote_ra_data; + + if (ra_data->cur_cmd == cmd) { + ra_data->cur_cmd = NULL; + } + if (ra_data->cmds) { + mainloop_set_trigger(ra_data->work); + } + } + + report_remote_ra_result(cmd); + free_cmd(cmd); + + if(lrm_state) { + lrm_state_disconnect(lrm_state); + } + return FALSE; +} + +static void +synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type) +{ + lrmd_event_data_t op = { 0, }; + + if (lrm_state == NULL) { + /* if lrm_state not given assume local */ + lrm_state = lrm_state_find(controld_globals.our_nodename); + } + CRM_ASSERT(lrm_state != NULL); + + op.type = lrmd_event_exec_complete; + op.rsc_id = rsc_id; + op.op_type = op_type; + op.t_run = (unsigned int) time(NULL); + op.t_rcchange = op.t_run; + op.call_id = generate_callid(); + lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + process_lrm_event(lrm_state, &op, NULL, NULL); +} + +void +remote_lrm_op_callback(lrmd_event_data_t * op) +{ + gboolean cmd_handled = FALSE; + lrm_state_t *lrm_state = NULL; + remote_ra_data_t *ra_data = NULL; + remote_ra_cmd_t *cmd = NULL; + + crm_debug("Processing '%s%s%s' event on remote connection to %s: %s " + "(%d) status=%s (%d)", + (op->op_type? op->op_type : ""), (op->op_type? 
" " : ""), + lrmd_event_type2str(op->type), op->remote_nodename, + services_ocf_exitcode_str(op->rc), op->rc, + pcmk_exec_status_str(op->op_status), op->op_status); + + lrm_state = lrm_state_find(op->remote_nodename); + if (!lrm_state || !lrm_state->remote_ra_data) { + crm_debug("No state information found for remote connection event"); + return; + } + ra_data = lrm_state->remote_ra_data; + + if (op->type == lrmd_event_new_client) { + // Another client has connected to the remote daemon + + if (pcmk_is_set(ra_data->status, expect_takeover)) { + // Great, we knew this was coming + lrm_remote_clear_flags(lrm_state, expect_takeover); + lrm_remote_set_flags(lrm_state, takeover_complete); + + } else { + crm_err("Disconnecting from Pacemaker Remote node %s due to " + "unexpected client takeover", op->remote_nodename); + /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */ + /* Do not free lrm_state->conn yet. */ + /* It'll be freed in the following stop action. 
*/ + lrm_state_disconnect_only(lrm_state); + } + return; + } + + /* filter all EXEC events up */ + if (op->type == lrmd_event_exec_complete) { + if (pcmk_is_set(ra_data->status, takeover_complete)) { + crm_debug("ignoring event, this connection is taken over by another node"); + } else { + lrm_op_callback(op); + } + return; + } + + if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) { + + if (!pcmk_is_set(ra_data->status, remote_active)) { + crm_debug("Disconnection from Pacemaker Remote node %s complete", + lrm_state->node_name); + + } else if (!remote_ra_is_in_maintenance(lrm_state)) { + crm_err("Lost connection to Pacemaker Remote node %s", + lrm_state->node_name); + ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds); + ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds); + + } else { + crm_notice("Unmanaged Pacemaker Remote node %s disconnected", + lrm_state->node_name); + /* Do roughly what a 'stop' on the remote-resource would do */ + handle_remote_ra_stop(lrm_state, NULL); + remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM); + /* now fake the reply of a successful 'stop' */ + synthesize_lrmd_success(NULL, lrm_state->node_name, "stop"); + } + return; + } + + if (!ra_data->cur_cmd) { + crm_debug("no event to match"); + return; + } + + cmd = ra_data->cur_cmd; + + /* Start actions and migrate from actions complete after connection + * comes back to us. 
*/ + if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start", + "migrate_from", NULL)) { + if (op->connection_rc < 0) { + update_remaining_timeout(cmd); + + if ((op->connection_rc == -ENOKEY) + || (op->connection_rc == -EKEYREJECTED)) { + // Hard error, don't retry + pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM, + PCMK_EXEC_ERROR, + pcmk_strerror(op->connection_rc)); + + } else if (cmd->remaining_timeout > 3000) { + crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout); + g_timeout_add(1000, retry_start_cmd_cb, lrm_state); + return; + + } else { + crm_trace("can't reschedule start, remaining timeout too small %d", + cmd->remaining_timeout); + pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_TIMEOUT, + "%s without enough time to retry", + pcmk_strerror(op->connection_rc)); + } + + } else { + lrm_state_reset_tables(lrm_state, TRUE); + pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + lrm_remote_set_flags(lrm_state, remote_active); + } + + crm_debug("Remote connection event matched %s action", cmd->action); + report_remote_ra_result(cmd); + cmd_handled = TRUE; + + } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + + if (cmd->monitor_timeout_id) { + g_source_remove(cmd->monitor_timeout_id); + cmd->monitor_timeout_id = 0; + } + + /* Only report success the first time, after that only worry about failures. + * For this function, if we get the poke pack, it is always a success. Pokes + * only fail if the send fails, or the response times out. */ + if (!pcmk_is_set(cmd->status, cmd_reported_success)) { + pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + report_remote_ra_result(cmd); + cmd_set_flags(cmd, cmd_reported_success); + } + + crm_debug("Remote poke event matched %s action", cmd->action); + + /* success, keep rescheduling if interval is present. 
*/ + if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) { + ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd); + cmd->interval_id = g_timeout_add(cmd->interval_ms, + recurring_helper, cmd); + cmd = NULL; /* prevent free */ + } + cmd_handled = TRUE; + + } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + if (pcmk_is_set(ra_data->status, remote_active) && + !pcmk_is_set(cmd->status, cmd_cancel)) { + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_ERROR, + "Remote connection unexpectedly dropped " + "during monitor"); + report_remote_ra_result(cmd); + crm_err("Remote connection to %s unexpectedly dropped during monitor", + lrm_state->node_name); + } + cmd_handled = TRUE; + + } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + + handle_remote_ra_stop(lrm_state, cmd); + cmd_handled = TRUE; + + } else { + crm_debug("Event did not match %s action", ra_data->cur_cmd->action); + } + + if (cmd_handled) { + ra_data->cur_cmd = NULL; + if (ra_data->cmds) { + mainloop_set_trigger(ra_data->work); + } + free_cmd(cmd); + } +} + +static void +handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd) +{ + remote_ra_data_t *ra_data = NULL; + + CRM_ASSERT(lrm_state); + ra_data = lrm_state->remote_ra_data; + + if (!pcmk_is_set(ra_data->status, takeover_complete)) { + /* delete pending ops when ever the remote connection is intentionally stopped */ + g_hash_table_remove_all(lrm_state->active_ops); + } else { + /* we no longer hold the history if this connection has been migrated, + * however, we keep metadata cache for future use */ + lrm_state_reset_tables(lrm_state, FALSE); + } + + lrm_remote_clear_flags(lrm_state, remote_active); + lrm_state_disconnect(lrm_state); + + if (ra_data->cmds) { + g_list_free_full(ra_data->cmds, free_cmd); + } + if (ra_data->recurring_cmds) { + g_list_free_full(ra_data->recurring_cmds, 
free_cmd); + } + ra_data->cmds = NULL; + ra_data->recurring_cmds = NULL; + ra_data->cur_cmd = NULL; + + if (cmd) { + pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + report_remote_ra_result(cmd); + } +} + +// \return Standard Pacemaker return code +static int +handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms) +{ + const char *server = NULL; + lrmd_key_value_t *tmp = NULL; + int port = 0; + int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms; + int rc = pcmk_rc_ok; + + for (tmp = cmd->params; tmp; tmp = tmp->next) { + if (pcmk__strcase_any_of(tmp->key, XML_RSC_ATTR_REMOTE_RA_ADDR, + XML_RSC_ATTR_REMOTE_RA_SERVER, NULL)) { + server = tmp->value; + } else if (pcmk__str_eq(tmp->key, XML_RSC_ATTR_REMOTE_RA_PORT, pcmk__str_casei)) { + port = atoi(tmp->value); + } else if (pcmk__str_eq(tmp->key, CRM_META "_" XML_RSC_ATTR_CONTAINER, pcmk__str_casei)) { + lrm_remote_set_flags(lrm_state, controlling_guest); + } + } + + rc = controld_connect_remote_executor(lrm_state, server, port, + timeout_used); + if (rc != pcmk_rc_ok) { + pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_ERROR, + "Could not connect to Pacemaker Remote node %s: %s", + lrm_state->node_name, pcmk_rc_str(rc)); + } + return rc; +} + +static gboolean +handle_remote_ra_exec(gpointer user_data) +{ + int rc = 0; + lrm_state_t *lrm_state = user_data; + remote_ra_data_t *ra_data = lrm_state->remote_ra_data; + remote_ra_cmd_t *cmd; + GList *first = NULL; + + if (ra_data->cur_cmd) { + /* still waiting on previous cmd */ + return TRUE; + } + + while (ra_data->cmds) { + first = ra_data->cmds; + cmd = first->data; + if (cmd->delay_id) { + /* still waiting for start delay timer to trip */ + return TRUE; + } + + ra_data->cmds = g_list_remove_link(ra_data->cmds, first); + g_list_free_1(first); + + if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) { + lrm_remote_clear_flags(lrm_state, 
expect_takeover | takeover_complete); + if (handle_remote_ra_start(lrm_state, cmd, + cmd->timeout) == pcmk_rc_ok) { + /* take care of this later when we get async connection result */ + crm_debug("Initiated async remote connection, %s action will complete after connect event", + cmd->action); + ra_data->cur_cmd = cmd; + return TRUE; + } + report_remote_ra_result(cmd); + + } else if (!strcmp(cmd->action, "monitor")) { + + if (lrm_state_is_connected(lrm_state) == TRUE) { + rc = lrm_state_poke_connection(lrm_state); + if (rc < 0) { + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_ERROR, pcmk_strerror(rc)); + } + } else { + rc = -1; + pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING, + PCMK_EXEC_DONE, "Remote connection inactive"); + } + + if (rc == 0) { + crm_debug("Poked Pacemaker Remote at node %s, waiting for async response", + cmd->rsc_id); + ra_data->cur_cmd = cmd; + cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd); + return TRUE; + } + report_remote_ra_result(cmd); + + } else if (!strcmp(cmd->action, "stop")) { + + if (pcmk_is_set(ra_data->status, expect_takeover)) { + /* briefly wait on stop for the takeover event to occur. If the + * takeover event does not occur during the wait period, that's fine. + * It just means that the remote-node's lrm_status section is going to get + * cleared which will require all the resources running in the remote-node + * to be explicitly re-detected via probe actions. If the takeover does occur + * successfully, then we can leave the status section intact. 
*/ + cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd); + ra_data->cur_cmd = cmd; + return TRUE; + } + + handle_remote_ra_stop(lrm_state, cmd); + + } else if (!strcmp(cmd->action, "migrate_to")) { + lrm_remote_clear_flags(lrm_state, takeover_complete); + lrm_remote_set_flags(lrm_state, expect_takeover); + pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + report_remote_ra_result(cmd); + } else if (pcmk__str_any_of(cmd->action, CRMD_ACTION_RELOAD, + CRMD_ACTION_RELOAD_AGENT, NULL)) { + /* Currently the only reloadable parameter is reconnect_interval, + * which is only used by the scheduler via the CIB, so reloads are a + * no-op. + * + * @COMPAT DC <2.1.0: We only need to check for "reload" in case + * we're in a rolling upgrade with a DC scheduling "reload" instead + * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway, + * so this would work for that purpose as well. + */ + pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + report_remote_ra_result(cmd); + } + + free_cmd(cmd); + } + + return TRUE; +} + +static void +remote_ra_data_init(lrm_state_t * lrm_state) +{ + remote_ra_data_t *ra_data = NULL; + + if (lrm_state->remote_ra_data) { + return; + } + + ra_data = calloc(1, sizeof(remote_ra_data_t)); + ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state); + lrm_state->remote_ra_data = ra_data; +} + +void +remote_ra_cleanup(lrm_state_t * lrm_state) +{ + remote_ra_data_t *ra_data = lrm_state->remote_ra_data; + + if (!ra_data) { + return; + } + + if (ra_data->cmds) { + g_list_free_full(ra_data->cmds, free_cmd); + } + + if (ra_data->recurring_cmds) { + g_list_free_full(ra_data->recurring_cmds, free_cmd); + } + mainloop_destroy_trigger(ra_data->work); + free(ra_data); + lrm_state->remote_ra_data = NULL; +} + +gboolean +is_remote_lrmd_ra(const char *agent, const char *provider, const char *id) +{ + if (agent && provider && !strcmp(agent, 
REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) { + return TRUE; + } + if ((id != NULL) && (lrm_state_find(id) != NULL) + && !pcmk__str_eq(id, controld_globals.our_nodename, pcmk__str_casei)) { + return TRUE; + } + + return FALSE; +} + +lrmd_rsc_info_t * +remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id) +{ + lrmd_rsc_info_t *info = NULL; + + if ((lrm_state_find(rsc_id))) { + info = calloc(1, sizeof(lrmd_rsc_info_t)); + + info->id = strdup(rsc_id); + info->type = strdup(REMOTE_LRMD_RA); + info->standard = strdup(PCMK_RESOURCE_CLASS_OCF); + info->provider = strdup("pacemaker"); + } + + return info; +} + +static gboolean +is_remote_ra_supported_action(const char *action) +{ + return pcmk__str_any_of(action, + CRMD_ACTION_START, + CRMD_ACTION_STOP, + CRMD_ACTION_STATUS, + CRMD_ACTION_MIGRATE, + CRMD_ACTION_MIGRATED, + CRMD_ACTION_RELOAD_AGENT, + CRMD_ACTION_RELOAD, + NULL); +} + +static GList * +fail_all_monitor_cmds(GList * list) +{ + GList *rm_list = NULL; + remote_ra_cmd_t *cmd = NULL; + GList *gIter = NULL; + + for (gIter = list; gIter != NULL; gIter = gIter->next) { + cmd = gIter->data; + if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + rm_list = g_list_append(rm_list, cmd); + } + } + + for (gIter = rm_list; gIter != NULL; gIter = gIter->next) { + cmd = gIter->data; + + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_ERROR, "Lost connection to remote executor"); + crm_trace("Pre-emptively failing %s %s (interval=%u, %s)", + cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata); + report_remote_ra_result(cmd); + + list = g_list_remove(list, cmd); + free_cmd(cmd); + } + + /* frees only the list data, not the cmds */ + g_list_free(rm_list); + return list; +} + +static GList * +remove_cmd(GList * list, const char *action, guint interval_ms) +{ + remote_ra_cmd_t *cmd = NULL; + GList *gIter = NULL; + + for (gIter = list; gIter != NULL; gIter = gIter->next) { + cmd = gIter->data; + if 
((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
            break;
        }
        cmd = NULL;  /* reset so a non-match at list end isn't removed below */
    }
    if (cmd) {
        list = g_list_remove(list, cmd);
        free_cmd(cmd);
    }
    return list;
}

/*!
 * \internal
 * \brief Cancel a queued or in-flight remote resource action
 *
 * \param[in] lrm_state    Executor state object (unused here)
 * \param[in] rsc_id       Connection resource ID
 * \param[in] action       Action name to cancel
 * \param[in] interval_ms  Interval (in milliseconds) of action to cancel
 *
 * \return 0 on success, -EINVAL if \p rsc_id has no connection state
 */
int
remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
                 const char *action, guint interval_ms)
{
    lrm_state_t *connection_rsc = NULL;
    remote_ra_data_t *ra_data = NULL;

    connection_rsc = lrm_state_find(rsc_id);
    if (!connection_rsc || !connection_rsc->remote_ra_data) {
        return -EINVAL;
    }

    /* remove any matching command that is still queued ... */
    ra_data = connection_rsc->remote_ra_data;
    ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
    ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
                                         interval_ms);

    /* ... and flag the currently executing command for cancellation if it
     * matches (it cannot simply be removed while in flight) */
    if (ra_data->cur_cmd &&
        (ra_data->cur_cmd->interval_ms == interval_ms) &&
        (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {

        cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
    }

    return 0;
}

/*!
 * \internal
 * \brief Merge a new monitor request into an existing queued monitor, if any
 *
 * \param[in,out] ra_data      Remote connection data
 * \param[in]     interval_ms  Interval (in milliseconds) of requested monitor
 * \param[in]     userdata     User data of requested monitor (may be NULL)
 *
 * \return Existing monitor command that absorbed the request, or NULL if no
 *         duplicate was found (caller should queue a new command)
 */
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* there are 3 places a potential duplicate monitor operation
     * could exist.
     * 1. recurring_cmds list. where the op is waiting for its next interval
     * 2. cmds list, where the op is queued to get executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now.
+ */ + if (interval_ms == 0) { + return NULL; + } + + if (ra_data->cur_cmd && + !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) && + (ra_data->cur_cmd->interval_ms == interval_ms) && + pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) { + + cmd = ra_data->cur_cmd; + goto handle_dup; + } + + for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) { + cmd = gIter->data; + if ((cmd->interval_ms == interval_ms) + && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + goto handle_dup; + } + } + + for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) { + cmd = gIter->data; + if ((cmd->interval_ms == interval_ms) + && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + goto handle_dup; + } + } + + return NULL; + +handle_dup: + + crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT, + cmd->rsc_id, "monitor", interval_ms); + + /* update the userdata */ + if (userdata) { + free(cmd->userdata); + cmd->userdata = strdup(userdata); + } + + /* if we've already reported success, generate a new call id */ + if (pcmk_is_set(cmd->status, cmd_reported_success)) { + cmd->start_time = time(NULL); + cmd->call_id = generate_callid(); + cmd_clear_flags(cmd, cmd_reported_success); + } + + /* if we have an interval_id set, that means we are in the process of + * waiting for this cmd's next interval. instead of waiting, cancel + * the timer and execute the action immediately */ + if (cmd->interval_id) { + g_source_remove(cmd->interval_id); + cmd->interval_id = 0; + recurring_helper(cmd); + } + + return cmd; +} + +/*! 
+ * \internal + * \brief Execute an action using the (internal) ocf:pacemaker:remote agent + * + * \param[in] lrm_state Executor state object for remote connection + * \param[in] rsc_id Connection resource ID + * \param[in] action Action to execute + * \param[in] userdata String to copy and pass to execution callback + * \param[in] interval_ms Action interval (in milliseconds) + * \param[in] timeout_ms Action timeout (in milliseconds) + * \param[in] start_delay_ms Delay (in milliseconds) before executing action + * \param[in,out] params Connection resource parameters + * \param[out] call_id Where to store call ID on success + * + * \return Standard Pacemaker return code + * \note This takes ownership of \p params, which should not be used or freed + * after calling this function. + */ +int +controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id, + const char *action, const char *userdata, + guint interval_ms, int timeout_ms, + int start_delay_ms, lrmd_key_value_t *params, + int *call_id) +{ + lrm_state_t *connection_rsc = NULL; + remote_ra_cmd_t *cmd = NULL; + remote_ra_data_t *ra_data = NULL; + + *call_id = 0; + + CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL) + && (userdata != NULL) && (call_id != NULL), + lrmd_key_value_freeall(params); return EINVAL); + + if (!is_remote_ra_supported_action(action)) { + lrmd_key_value_freeall(params); + return EOPNOTSUPP; + } + + connection_rsc = lrm_state_find(rsc_id); + if (connection_rsc == NULL) { + lrmd_key_value_freeall(params); + return ENOTCONN; + } + + remote_ra_data_init(connection_rsc); + ra_data = connection_rsc->remote_ra_data; + + cmd = handle_dup_monitor(ra_data, interval_ms, userdata); + if (cmd) { + *call_id = cmd->call_id; + lrmd_key_value_freeall(params); + return pcmk_rc_ok; + } + + cmd = calloc(1, sizeof(remote_ra_cmd_t)); + if (cmd == NULL) { + lrmd_key_value_freeall(params); + return ENOMEM; + } + + cmd->owner = strdup(lrm_state->node_name); + cmd->rsc_id = 
strdup(rsc_id);
    cmd->action = strdup(action);
    cmd->userdata = strdup(userdata);
    /* bail out cleanly if any of the required duplications failed */
    if ((cmd->owner == NULL) || (cmd->rsc_id == NULL) || (cmd->action == NULL)
        || (cmd->userdata == NULL)) {
        free_cmd(cmd);
        lrmd_key_value_freeall(params);
        return ENOMEM;
    }

    cmd->interval_ms = interval_ms;
    cmd->timeout = timeout_ms;
    cmd->start_delay = start_delay_ms;
    cmd->params = params;       /* cmd takes ownership of params from here */
    cmd->start_time = time(NULL);

    cmd->call_id = generate_callid();

    /* defer execution via a one-shot timer when a start delay was requested */
    if (cmd->start_delay) {
        cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
    }

    ra_data->cmds = g_list_append(ra_data->cmds, cmd);
    mainloop_set_trigger(ra_data->work);

    *call_id = cmd->call_id;
    return pcmk_rc_ok;
}

/*!
 * \internal
 * \brief Immediately fail all monitors of a remote node, if proxied here
 *
 * \param[in] node_name  Name of pacemaker_remote node
 */
void
remote_ra_fail(const char *node_name)
{
    lrm_state_t *lrm_state = lrm_state_find(node_name);

    if (lrm_state && lrm_state_is_connected(lrm_state)) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
        ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
        ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
    }
}

/* A guest node fencing implied by host fencing looks like:
 *
 * <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
 *               on_node="lxc1" on_node_uuid="lxc1">
 *    <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
 *                CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
 *    <downed>
 *      <node id="lxc1"/>
 *    </downed>
 * </pseudo_event>
 */
#define XPATH_PSEUDO_FENCE "/" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
    "/" XML_CIB_TAG_NODE

/*!
 * \internal
 * \brief Check a pseudo-action for Pacemaker Remote node side effects
 *
 * \param[in,out] xml  XML of pseudo-action to check
 */
void
remote_ra_process_pseudo(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);

    if (numXpathResults(search) == 1) {
        xmlNode *result = getXpathResult(search, 0);

        /* Normally, we handle the necessary side effects of a guest node stop
         * action when reporting the remote agent's result. However, if the stop
         * is implied due to fencing, it will be a fencing pseudo-event, and
         * there won't be a result to report. Handle that case here.
         *
         * This will result in a duplicate call to remote_node_down() if the
         * guest stop was real instead of implied, but that shouldn't hurt.
         *
         * There is still one corner case that isn't handled: if a guest node
         * isn't running any resources when its host is fenced, it will appear
         * to be cleanly stopped, so there will be no pseudo-fence, and our
         * peer cache state will be incorrect unless and until the guest is
         * recovered.
         */
        if (result) {
            const char *remote = ID(result);

            if (remote) {
                remote_node_down(remote, DOWN_ERASE_LRM);
            }
        }
    }
    freeXpathObject(search);
}

/*!
 * \internal
 * \brief Record a remote node's maintenance state in the CIB status section
 *
 * \param[in,out] lrm_state    Executor state for the remote connection
 * \param[in]     maintenance  TRUE-ish to set maintenance mode, else clear it
 */
static void
remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
{
    xmlNode *update, *state;
    int call_opt;
    crm_node_t *node;

    call_opt = crmd_cib_smart_opt();
    node = crm_remote_peer_get(lrm_state->node_name);
    CRM_CHECK(node != NULL, return);
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_none, update,
                                     __func__);
    crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
    /* only track the new state locally once the CIB update was submitted
     * successfully */
    if (controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt,
                            NULL) == pcmk_rc_ok) {
        /* TODO: still not 100% sure that async update will succeed ...
         */
        if (maintenance) {
            lrm_remote_set_flags(lrm_state, remote_in_maint);
        } else {
            lrm_remote_clear_flags(lrm_state, remote_in_maint);
        }
    }
    free_xml(update);
}

#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
    XML_GRAPH_TAG_MAINTENANCE

/*!
 * \internal
 * \brief Check a pseudo-action holding updates for maintenance state
 *
 * \param[in,out] xml  XML of pseudo-action to check
 */
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;

        /* NOTE(review): pcmk__xml_next() presumably advances to any next
         * sibling element, not only <node> children — confirm whether
         * non-<node> siblings can appear here and inflate cnt */
        for (node =
                first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
             node != NULL; node = pcmk__xml_next(node)) {
            lrm_state_t *lrm_state = lrm_state_find(ID(node));

            cnt++;
            /* apply the update only to connections we actively manage */
            if (lrm_state && lrm_state->remote_ra_data &&
                pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {
                int is_maint;

                cnt_remote++;
                pcmk__scan_min_int(crm_element_value(node, XML_NODE_IS_MAINTENANCE),
                                   &is_maint, 0);
                remote_ra_maintenance(lrm_state, is_maint);
            }
        }
        crm_trace("Action holds %d nodes (%d remotes found) "
                  "adjusting maintenance-mode", cnt, cnt_remote);
    }
    freeXpathObject(search);
}

/*!
 * \internal
 * \brief Check whether a remote node is flagged as being in maintenance mode
 *
 * \param[in] lrm_state  Executor state for the remote connection
 *
 * \return TRUE if the remote_in_maint flag is set, otherwise FALSE
 * \note NOTE(review): lrm_state->remote_ra_data is dereferenced without a
 *       NULL check — confirm callers guarantee it is initialized
 */
gboolean
remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    return pcmk_is_set(ra_data->status, remote_in_maint);
}

/*!
 * \internal
 * \brief Check whether this connection is for a guest node we control
 *
 * \param[in] lrm_state  Executor state for the remote connection
 *
 * \return TRUE if the controlling_guest flag is set, otherwise FALSE
 */
gboolean
remote_ra_controlling_guest(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    return pcmk_is_set(ra_data->status, controlling_guest);
}