diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
commit | e5a812082ae033afb1eed82c0f2df3d0f6bdc93f (patch) | |
tree | a6716c9275b4b413f6c9194798b34b91affb3cc7 /daemons/execd/execd_commands.c | |
parent | Initial commit. (diff) | |
download | pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.tar.xz pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.zip |
Adding upstream version 2.1.6. [upstream/2.1.6]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'daemons/execd/execd_commands.c')
-rw-r--r-- | daemons/execd/execd_commands.c | 1927 |
1 files changed, 1927 insertions, 0 deletions
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c new file mode 100644 index 0000000..fa2761e --- /dev/null +++ b/daemons/execd/execd_commands.c @@ -0,0 +1,1927 @@ +/* + * Copyright 2012-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> +#include <crm/fencing/internal.h> + +#include <glib.h> + +// Check whether we have a high-resolution monotonic clock +#undef PCMK__TIME_USE_CGT +#if HAVE_DECL_CLOCK_MONOTONIC && defined(CLOCK_MONOTONIC) +# define PCMK__TIME_USE_CGT +# include <time.h> /* clock_gettime */ +#endif + +#include <unistd.h> + +#include <crm/crm.h> +#include <crm/fencing/internal.h> +#include <crm/services.h> +#include <crm/services_internal.h> +#include <crm/common/mainloop.h> +#include <crm/common/ipc.h> +#include <crm/common/ipc_internal.h> +#include <crm/msg_xml.h> + +#include "pacemaker-execd.h" + +GHashTable *rsc_list = NULL; + +typedef struct lrmd_cmd_s { + int timeout; + guint interval_ms; + int start_delay; + int timeout_orig; + + int call_id; + + int call_opts; + /* Timer ids, must be removed on cmd destruction. */ + int delay_id; + int stonith_recurring_id; + + int rsc_deleted; + + int service_flags; + + char *client_id; + char *origin; + char *rsc_id; + char *action; + char *real_action; + char *userdata_str; + + pcmk__action_result_t result; + + /* We can track operation queue time and run time, to be saved with the CIB + * resource history (and displayed in cluster status). We need + * high-resolution monotonic time for this purpose, so we use + * clock_gettime(CLOCK_MONOTONIC, ...) (if available, otherwise this feature + * is disabled). 
+ * + * However, we also need epoch timestamps for recording the time the command + * last ran and the time its return value last changed, for use in time + * displays (as opposed to interval calculations). We keep time_t values for + * this purpose. + * + * The last run time is used for both purposes, so we keep redundant + * monotonic and epoch values for this. Technically the two could represent + * different times, but since time_t has only second resolution and the + * values are used for distinct purposes, that is not significant. + */ +#ifdef PCMK__TIME_USE_CGT + /* Recurring and systemd operations may involve more than one executor + * command per operation, so they need info about the original and the most + * recent. + */ + struct timespec t_first_run; // When op first ran + struct timespec t_run; // When op most recently ran + struct timespec t_first_queue; // When op was first queued + struct timespec t_queue; // When op was most recently queued +#endif + time_t epoch_last_run; // Epoch timestamp of when op last ran + time_t epoch_rcchange; // Epoch timestamp of when rc last changed + + bool first_notify_sent; + int last_notify_rc; + int last_notify_op_status; + int last_pid; + + GHashTable *params; +} lrmd_cmd_t; + +static void cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc); +static gboolean execute_resource_action(gpointer user_data); +static void cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id); + +#ifdef PCMK__TIME_USE_CGT + +/*! + * \internal + * \brief Check whether a struct timespec has been set + * + * \param[in] timespec Time to check + * + * \return true if timespec has been set (i.e. 
is nonzero), false otherwise + */ +static inline bool +time_is_set(const struct timespec *timespec) +{ + return (timespec != NULL) && + ((timespec->tv_sec != 0) || (timespec->tv_nsec != 0)); +} + +/*! + * \internal + * \brief Set a timespec (and its original if unset) to the current time + * + * \param[out] t_current Where to store current time + * \param[in,out] t_orig Where to copy t_current if unset (may be NULL) + */ +static void +get_current_time(struct timespec *t_current, struct timespec *t_orig) +{ + clock_gettime(CLOCK_MONOTONIC, t_current); + if ((t_orig != NULL) && !time_is_set(t_orig)) { + *t_orig = *t_current; + } +} + +/*! + * \internal + * \brief Return difference between two times in milliseconds + * + * \param[in] now More recent time (or NULL to use current time) + * \param[in] old Earlier time + * + * \return milliseconds difference (or 0 if old is NULL or unset) + * + * \note Can overflow on 32-bit machines when the difference is around + * 24 days or more. + */ +static int +time_diff_ms(const struct timespec *now, const struct timespec *old) +{ + int diff_ms = 0; + + if (time_is_set(old)) { + struct timespec local_now = { 0, }; + + if (now == NULL) { + clock_gettime(CLOCK_MONOTONIC, &local_now); + now = &local_now; + } + diff_ms = (now->tv_sec - old->tv_sec) * 1000 + + (now->tv_nsec - old->tv_nsec) / 1000000; + } + return diff_ms; +} + +/*! + * \internal + * \brief Reset a command's operation times to their original values. + * + * Reset a command's run and queued timestamps to the timestamps of the original + * command, so we report the entire time since then and not just the time since + * the most recent command (for recurring and systemd operations). 
+ * + * \param[in,out] cmd Executor command object to reset + * + * \note It's not obvious what the queued time should be for a systemd + * start/stop operation, which might go like this: + * initial command queued 5ms, runs 3s + * monitor command queued 10ms, runs 10s + * monitor command queued 10ms, runs 10s + * Is the queued time for that operation 5ms, 10ms or 25ms? The current + * implementation will report 5ms. If it's 25ms, then we need to + * subtract 20ms from the total exec time so as not to count it twice. + * We can implement that later if it matters to anyone ... + */ +static void +cmd_original_times(lrmd_cmd_t * cmd) +{ + cmd->t_run = cmd->t_first_run; + cmd->t_queue = cmd->t_first_queue; +} +#endif + +static inline bool +action_matches(const lrmd_cmd_t *cmd, const char *action, guint interval_ms) +{ + return (cmd->interval_ms == interval_ms) + && pcmk__str_eq(cmd->action, action, pcmk__str_casei); +} + +/*! + * \internal + * \brief Log the result of an asynchronous command + * + * \param[in] cmd Command to log result for + * \param[in] exec_time_ms Execution time in milliseconds, if known + * \param[in] queue_time_ms Queue time in milliseconds, if known + */ +static void +log_finished(const lrmd_cmd_t *cmd, int exec_time_ms, int queue_time_ms) +{ + int log_level = LOG_INFO; + GString *str = g_string_sized_new(100); // reasonable starting size + + if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + log_level = LOG_DEBUG; + } + + g_string_append_printf(str, "%s %s (call %d", + cmd->rsc_id, cmd->action, cmd->call_id); + if (cmd->last_pid != 0) { + g_string_append_printf(str, ", PID %d", cmd->last_pid); + } + if (cmd->result.execution_status == PCMK_EXEC_DONE) { + g_string_append_printf(str, ") exited with status %d", + cmd->result.exit_status); + } else { + pcmk__g_strcat(str, ") could not be executed: ", + pcmk_exec_status_str(cmd->result.execution_status), + NULL); + } + if (cmd->result.exit_reason != NULL) { + pcmk__g_strcat(str, " (", 
cmd->result.exit_reason, ")", NULL); + } + +#ifdef PCMK__TIME_USE_CGT + pcmk__g_strcat(str, " (execution time ", + pcmk__readable_interval(exec_time_ms), NULL); + if (queue_time_ms > 0) { + pcmk__g_strcat(str, " after being queued ", + pcmk__readable_interval(queue_time_ms), NULL); + } + g_string_append_c(str, ')'); +#endif + + do_crm_log(log_level, "%s", str->str); + g_string_free(str, TRUE); +} + +static void +log_execute(lrmd_cmd_t * cmd) +{ + int log_level = LOG_INFO; + + if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + log_level = LOG_DEBUG; + } + + do_crm_log(log_level, "executing - rsc:%s action:%s call_id:%d", + cmd->rsc_id, cmd->action, cmd->call_id); +} + +static const char * +normalize_action_name(lrmd_rsc_t * rsc, const char *action) +{ + if (pcmk__str_eq(action, "monitor", pcmk__str_casei) && + pcmk_is_set(pcmk_get_ra_caps(rsc->class), pcmk_ra_cap_status)) { + return "status"; + } + return action; +} + +static lrmd_rsc_t * +build_rsc_from_xml(xmlNode * msg) +{ + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR); + lrmd_rsc_t *rsc = NULL; + + rsc = calloc(1, sizeof(lrmd_rsc_t)); + + crm_element_value_int(msg, F_LRMD_CALLOPTS, &rsc->call_opts); + + rsc->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID); + rsc->class = crm_element_value_copy(rsc_xml, F_LRMD_CLASS); + rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER); + rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE); + rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, execute_resource_action, + rsc); + + // Initialize fence device probes (to return "not running") + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_ERROR, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + return rsc; +} + +static lrmd_cmd_t * +create_lrmd_cmd(xmlNode *msg, pcmk__client_t *client) +{ + int call_options = 0; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR); + lrmd_cmd_t *cmd = NULL; + + cmd = calloc(1, sizeof(lrmd_cmd_t)); + + crm_element_value_int(msg, 
F_LRMD_CALLOPTS, &call_options); + cmd->call_opts = call_options; + cmd->client_id = strdup(client->id); + + crm_element_value_int(msg, F_LRMD_CALLID, &cmd->call_id); + crm_element_value_ms(rsc_xml, F_LRMD_RSC_INTERVAL, &cmd->interval_ms); + crm_element_value_int(rsc_xml, F_LRMD_TIMEOUT, &cmd->timeout); + crm_element_value_int(rsc_xml, F_LRMD_RSC_START_DELAY, &cmd->start_delay); + cmd->timeout_orig = cmd->timeout; + + cmd->origin = crm_element_value_copy(rsc_xml, F_LRMD_ORIGIN); + cmd->action = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ACTION); + cmd->userdata_str = crm_element_value_copy(rsc_xml, F_LRMD_RSC_USERDATA_STR); + cmd->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID); + + cmd->params = xml2list(rsc_xml); + + if (pcmk__str_eq(g_hash_table_lookup(cmd->params, "CRM_meta_on_fail"), "block", pcmk__str_casei)) { + crm_debug("Setting flag to leave pid group on timeout and " + "only kill action pid for " PCMK__OP_FMT, + cmd->rsc_id, cmd->action, cmd->interval_ms); + cmd->service_flags = pcmk__set_flags_as(__func__, __LINE__, + LOG_TRACE, "Action", + cmd->action, 0, + SVC_ACTION_LEAVE_GROUP, + "SVC_ACTION_LEAVE_GROUP"); + } + return cmd; +} + +static void +stop_recurring_timer(lrmd_cmd_t *cmd) +{ + if (cmd) { + if (cmd->stonith_recurring_id) { + g_source_remove(cmd->stonith_recurring_id); + } + cmd->stonith_recurring_id = 0; + } +} + +static void +free_lrmd_cmd(lrmd_cmd_t * cmd) +{ + stop_recurring_timer(cmd); + if (cmd->delay_id) { + g_source_remove(cmd->delay_id); + } + if (cmd->params) { + g_hash_table_destroy(cmd->params); + } + pcmk__reset_result(&(cmd->result)); + free(cmd->origin); + free(cmd->action); + free(cmd->real_action); + free(cmd->userdata_str); + free(cmd->rsc_id); + free(cmd->client_id); + free(cmd); +} + +static gboolean +stonith_recurring_op_helper(gpointer data) +{ + lrmd_cmd_t *cmd = data; + lrmd_rsc_t *rsc; + + cmd->stonith_recurring_id = 0; + + if (!cmd->rsc_id) { + return FALSE; + } + + rsc = g_hash_table_lookup(rsc_list, 
cmd->rsc_id); + + CRM_ASSERT(rsc != NULL); + /* Take it out of the recurring_ops list, and put it in the pending ops + * to be executed. */ + rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd); + rsc->pending_ops = g_list_append(rsc->pending_ops, cmd); +#ifdef PCMK__TIME_USE_CGT + get_current_time(&(cmd->t_queue), &(cmd->t_first_queue)); +#endif + mainloop_set_trigger(rsc->work); + + return FALSE; +} + +static inline void +start_recurring_timer(lrmd_cmd_t *cmd) +{ + if (cmd && (cmd->interval_ms > 0)) { + cmd->stonith_recurring_id = g_timeout_add(cmd->interval_ms, + stonith_recurring_op_helper, + cmd); + } +} + +static gboolean +start_delay_helper(gpointer data) +{ + lrmd_cmd_t *cmd = data; + lrmd_rsc_t *rsc = NULL; + + cmd->delay_id = 0; + rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; + + if (rsc) { + mainloop_set_trigger(rsc->work); + } + + return FALSE; +} + +/*! + * \internal + * \brief Check whether a list already contains the equivalent of a given action + * + * \param[in] action_list List to search + * \param[in] cmd Action to search for + * + * \return First command in \p action_list with the same action name and + * interval as \p cmd, or NULL if there is none + */ +static lrmd_cmd_t * +find_duplicate_action(const GList *action_list, const lrmd_cmd_t *cmd) +{ + for (const GList *item = action_list; item != NULL; item = item->next) { + lrmd_cmd_t *dup = item->data; + + if (action_matches(cmd, dup->action, dup->interval_ms)) { + return dup; + } + } + return NULL; +} + +static bool +merge_recurring_duplicate(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) +{ + lrmd_cmd_t * dup = NULL; + bool dup_pending = true; + + if (cmd->interval_ms == 0) { + return false; + } + + // Search for a duplicate of this action (in-flight or not) + dup = find_duplicate_action(rsc->pending_ops, cmd); + if (dup == NULL) { + dup_pending = false; + dup = find_duplicate_action(rsc->recurring_ops, cmd); + if (dup == NULL) { + return false; + } + } + + /* Do not merge fencing monitors marked for cancellation, so we can reply to + * the cancellation separately. 
+ */ + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_casei) + && (dup->result.execution_status == PCMK_EXEC_CANCELLED)) { + return false; + } + + /* This should not occur. If it does, we need to investigate how something + * like this is possible in the controller. + */ + crm_warn("Duplicate recurring op entry detected (" PCMK__OP_FMT + "), merging with previous op entry", + rsc->rsc_id, normalize_action_name(rsc, dup->action), + dup->interval_ms); + + // Merge new action's call ID and user data into existing action + dup->first_notify_sent = false; + free(dup->userdata_str); + dup->userdata_str = cmd->userdata_str; + cmd->userdata_str = NULL; + dup->call_id = cmd->call_id; + free_lrmd_cmd(cmd); + cmd = NULL; + + /* If dup is not pending, that means it has already executed at least once + * and is waiting in the interval. In that case, stop waiting and initiate + * a new instance now. + */ + if (!dup_pending) { + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_casei)) { + stop_recurring_timer(dup); + stonith_recurring_op_helper(dup); + } else { + services_action_kick(rsc->rsc_id, + normalize_action_name(rsc, dup->action), + dup->interval_ms); + } + } + return true; +} + +static void +schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) +{ + CRM_CHECK(cmd != NULL, return); + CRM_CHECK(rsc != NULL, return); + + crm_trace("Scheduling %s on %s", cmd->action, rsc->rsc_id); + + if (merge_recurring_duplicate(rsc, cmd)) { + // Equivalent of cmd has already been scheduled + return; + } + + /* The controller expects the executor to automatically cancel + * recurring operations before a resource stops. 
+ */ + if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + cancel_all_recurring(rsc, NULL); + } + + rsc->pending_ops = g_list_append(rsc->pending_ops, cmd); +#ifdef PCMK__TIME_USE_CGT + get_current_time(&(cmd->t_queue), &(cmd->t_first_queue)); +#endif + mainloop_set_trigger(rsc->work); + + if (cmd->start_delay) { + cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd); + } +} + +static xmlNode * +create_lrmd_reply(const char *origin, int rc, int call_id) +{ + xmlNode *reply = create_xml_node(NULL, T_LRMD_REPLY); + + crm_xml_add(reply, F_LRMD_ORIGIN, origin); + crm_xml_add_int(reply, F_LRMD_RC, rc); + crm_xml_add_int(reply, F_LRMD_CALLID, call_id); + return reply; +} + +static void +send_client_notify(gpointer key, gpointer value, gpointer user_data) +{ + xmlNode *update_msg = user_data; + pcmk__client_t *client = value; + int rc; + int log_level = LOG_WARNING; + const char *msg = NULL; + + CRM_CHECK(client != NULL, return); + if (client->name == NULL) { + crm_trace("Skipping notification to client without name"); + return; + } + if (pcmk_is_set(client->flags, pcmk__client_to_proxy)) { + /* We only want to notify clients of the executor IPC API. If we are + * running as Pacemaker Remote, we may have clients proxied to other + * IPC services in the cluster, so skip those. 
+ */ + crm_trace("Skipping executor API notification to client %s", + pcmk__client_name(client)); + return; + } + + rc = lrmd_server_send_notify(client, update_msg); + if (rc == pcmk_rc_ok) { + return; + } + + switch (rc) { + case ENOTCONN: + case EPIPE: // Client exited without waiting for notification + log_level = LOG_INFO; + msg = "Disconnected"; + break; + + default: + msg = pcmk_rc_str(rc); + break; + } + do_crm_log(log_level, "Could not notify client %s: %s " CRM_XS " rc=%d", + pcmk__client_name(client), msg, rc); +} + +static void +send_cmd_complete_notify(lrmd_cmd_t * cmd) +{ + xmlNode *notify = NULL; + int exec_time = 0; + int queue_time = 0; + +#ifdef PCMK__TIME_USE_CGT + exec_time = time_diff_ms(NULL, &(cmd->t_run)); + queue_time = time_diff_ms(&cmd->t_run, &(cmd->t_queue)); +#endif + log_finished(cmd, exec_time, queue_time); + + /* If the originator requested to be notified only for changes in recurring + * operation results, skip the notification if the result hasn't changed. 
+ */ + if (cmd->first_notify_sent + && pcmk_is_set(cmd->call_opts, lrmd_opt_notify_changes_only) + && (cmd->last_notify_rc == cmd->result.exit_status) + && (cmd->last_notify_op_status == cmd->result.execution_status)) { + return; + } + + cmd->first_notify_sent = true; + cmd->last_notify_rc = cmd->result.exit_status; + cmd->last_notify_op_status = cmd->result.execution_status; + + notify = create_xml_node(NULL, T_LRMD_NOTIFY); + + crm_xml_add(notify, F_LRMD_ORIGIN, __func__); + crm_xml_add_int(notify, F_LRMD_TIMEOUT, cmd->timeout); + crm_xml_add_ms(notify, F_LRMD_RSC_INTERVAL, cmd->interval_ms); + crm_xml_add_int(notify, F_LRMD_RSC_START_DELAY, cmd->start_delay); + crm_xml_add_int(notify, F_LRMD_EXEC_RC, cmd->result.exit_status); + crm_xml_add_int(notify, F_LRMD_OP_STATUS, cmd->result.execution_status); + crm_xml_add_int(notify, F_LRMD_CALLID, cmd->call_id); + crm_xml_add_int(notify, F_LRMD_RSC_DELETED, cmd->rsc_deleted); + + crm_xml_add_ll(notify, F_LRMD_RSC_RUN_TIME, + (long long) cmd->epoch_last_run); + crm_xml_add_ll(notify, F_LRMD_RSC_RCCHANGE_TIME, + (long long) cmd->epoch_rcchange); +#ifdef PCMK__TIME_USE_CGT + crm_xml_add_int(notify, F_LRMD_RSC_EXEC_TIME, exec_time); + crm_xml_add_int(notify, F_LRMD_RSC_QUEUE_TIME, queue_time); +#endif + + crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_RSC_EXEC); + crm_xml_add(notify, F_LRMD_RSC_ID, cmd->rsc_id); + if(cmd->real_action) { + crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->real_action); + } else { + crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action); + } + crm_xml_add(notify, F_LRMD_RSC_USERDATA_STR, cmd->userdata_str); + crm_xml_add(notify, F_LRMD_RSC_EXIT_REASON, cmd->result.exit_reason); + + if (cmd->result.action_stderr != NULL) { + crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->result.action_stderr); + + } else if (cmd->result.action_stdout != NULL) { + crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->result.action_stdout); + } + + if (cmd->params) { + char *key = NULL; + char *value = NULL; + GHashTableIter iter; + + 
xmlNode *args = create_xml_node(notify, XML_TAG_ATTRS); + + g_hash_table_iter_init(&iter, cmd->params); + while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { + hash2smartfield((gpointer) key, (gpointer) value, args); + } + } + if ((cmd->client_id != NULL) + && pcmk_is_set(cmd->call_opts, lrmd_opt_notify_orig_only)) { + + pcmk__client_t *client = pcmk__find_client_by_id(cmd->client_id); + + if (client != NULL) { + send_client_notify(client->id, client, notify); + } + } else { + pcmk__foreach_ipc_client(send_client_notify, notify); + } + + free_xml(notify); +} + +static void +send_generic_notify(int rc, xmlNode * request) +{ + if (pcmk__ipc_client_count() != 0) { + int call_id = 0; + xmlNode *notify = NULL; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + const char *op = crm_element_value(request, F_LRMD_OPERATION); + + crm_element_value_int(request, F_LRMD_CALLID, &call_id); + + notify = create_xml_node(NULL, T_LRMD_NOTIFY); + crm_xml_add(notify, F_LRMD_ORIGIN, __func__); + crm_xml_add_int(notify, F_LRMD_RC, rc); + crm_xml_add_int(notify, F_LRMD_CALLID, call_id); + crm_xml_add(notify, F_LRMD_OPERATION, op); + crm_xml_add(notify, F_LRMD_RSC_ID, rsc_id); + + pcmk__foreach_ipc_client(send_client_notify, notify); + + free_xml(notify); + } +} + +static void +cmd_reset(lrmd_cmd_t * cmd) +{ + cmd->last_pid = 0; +#ifdef PCMK__TIME_USE_CGT + memset(&cmd->t_run, 0, sizeof(cmd->t_run)); + memset(&cmd->t_queue, 0, sizeof(cmd->t_queue)); +#endif + cmd->epoch_last_run = 0; + + pcmk__reset_result(&(cmd->result)); + cmd->result.execution_status = PCMK_EXEC_DONE; +} + +static void +cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc) +{ + crm_trace("Resource operation rsc:%s action:%s completed (%p %p)", cmd->rsc_id, cmd->action, + rsc ? 
rsc->active : NULL, cmd); + + if (rsc && (rsc->active == cmd)) { + rsc->active = NULL; + mainloop_set_trigger(rsc->work); + } + + if (!rsc) { + cmd->rsc_deleted = 1; + } + + /* reset original timeout so client notification has correct information */ + cmd->timeout = cmd->timeout_orig; + + send_cmd_complete_notify(cmd); + + if ((cmd->interval_ms != 0) + && (cmd->result.execution_status == PCMK_EXEC_CANCELLED)) { + + if (rsc) { + rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd); + rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd); + } + free_lrmd_cmd(cmd); + } else if (cmd->interval_ms == 0) { + if (rsc) { + rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd); + } + free_lrmd_cmd(cmd); + } else { + /* Clear all the values pertaining just to the last iteration of a recurring op. */ + cmd_reset(cmd); + } +} + +struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +}; + +static void +notify_one_client(gpointer key, gpointer value, gpointer user_data) +{ + pcmk__client_t *client = value; + struct notify_new_client_data *data = user_data; + + if (!pcmk__str_eq(client->id, data->new_client->id, pcmk__str_casei)) { + send_client_notify(key, (gpointer) client, (gpointer) data->notify); + } +} + +void +notify_of_new_client(pcmk__client_t *new_client) +{ + struct notify_new_client_data data; + + data.new_client = new_client; + data.notify = create_xml_node(NULL, T_LRMD_NOTIFY); + crm_xml_add(data.notify, F_LRMD_ORIGIN, __func__); + crm_xml_add(data.notify, F_LRMD_OPERATION, LRMD_OP_NEW_CLIENT); + pcmk__foreach_ipc_client(notify_one_client, &data); + free_xml(data.notify); +} + +void +client_disconnect_cleanup(const char *client_id) +{ + GHashTableIter iter; + lrmd_rsc_t *rsc = NULL; + char *key = NULL; + + g_hash_table_iter_init(&iter, rsc_list); + while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) { + if (pcmk_all_flags_set(rsc->call_opts, lrmd_opt_drop_recurring)) { + /* This client is disconnecting, 
drop any recurring operations + * it may have initiated on the resource */ + cancel_all_recurring(rsc, client_id); + } + } +} + +static void +action_complete(svc_action_t * action) +{ + lrmd_rsc_t *rsc; + lrmd_cmd_t *cmd = action->cb_data; + enum ocf_exitcode code; + +#ifdef PCMK__TIME_USE_CGT + const char *rclass = NULL; + bool goagain = false; +#endif + + if (!cmd) { + crm_err("Completed executor action (%s) does not match any known operations", + action->id); + return; + } + +#ifdef PCMK__TIME_USE_CGT + if (cmd->result.exit_status != action->rc) { + cmd->epoch_rcchange = time(NULL); + } +#endif + + cmd->last_pid = action->pid; + + // Cast variable instead of function return to keep compilers happy + code = services_result2ocf(action->standard, cmd->action, action->rc); + pcmk__set_result(&(cmd->result), (int) code, + action->status, services__exit_reason(action)); + + rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; + +#ifdef PCMK__TIME_USE_CGT + if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_SERVICE, pcmk__str_casei)) { + rclass = resources_find_service_class(rsc->type); + } else if(rsc) { + rclass = rsc->class; + } + + if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) { + if (pcmk__result_ok(&(cmd->result)) + && pcmk__strcase_any_of(cmd->action, "start", "stop", NULL)) { + /* systemd returns from start and stop actions after the action + * begins, not after it completes. We have to jump through a few + * hoops so that we don't report 'complete' to the rest of pacemaker + * until it's actually done. 
+ */ + goagain = true; + cmd->real_action = cmd->action; + cmd->action = strdup("monitor"); + + } else if (cmd->real_action != NULL) { + // This is follow-up monitor to check whether start/stop completed + if (cmd->result.execution_status == PCMK_EXEC_PENDING) { + goagain = true; + + } else if (pcmk__result_ok(&(cmd->result)) + && pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) { + goagain = true; + + } else { + int time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); + int timeout_left = cmd->timeout_orig - time_sum; + + crm_debug("%s systemd %s is now complete (elapsed=%dms, " + "remaining=%dms): %s (%d)", + cmd->rsc_id, cmd->real_action, time_sum, timeout_left, + services_ocf_exitcode_str(cmd->result.exit_status), + cmd->result.exit_status); + cmd_original_times(cmd); + + // Monitors may return "not running", but start/stop shouldn't + if ((cmd->result.execution_status == PCMK_EXEC_DONE) + && (cmd->result.exit_status == PCMK_OCF_NOT_RUNNING)) { + + if (pcmk__str_eq(cmd->real_action, "start", pcmk__str_casei)) { + cmd->result.exit_status = PCMK_OCF_UNKNOWN_ERROR; + } else if (pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) { + cmd->result.exit_status = PCMK_OCF_OK; + } + } + } + } + } +#endif + +#if SUPPORT_NAGIOS + if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei)) { + if (action_matches(cmd, "monitor", 0) + && pcmk__result_ok(&(cmd->result))) { + /* Successfully executed --version for the nagios plugin */ + cmd->result.exit_status = PCMK_OCF_NOT_RUNNING; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei) + && !pcmk__result_ok(&(cmd->result))) { +#ifdef PCMK__TIME_USE_CGT + goagain = true; +#endif + } + } +#endif + +#ifdef PCMK__TIME_USE_CGT + if (goagain) { + int time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); + int timeout_left = cmd->timeout_orig - time_sum; + int delay = cmd->timeout_orig / 10; + + if(delay >= timeout_left && timeout_left > 20) { + delay = timeout_left/2; + } + + delay = 
QB_MIN(2000, delay); + if (delay < timeout_left) { + cmd->start_delay = delay; + cmd->timeout = timeout_left; + + if (pcmk__result_ok(&(cmd->result))) { + crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay); + + } else if (cmd->result.execution_status == PCMK_EXEC_PENDING) { + crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->action, time_sum, timeout_left, delay); + + } else { + crm_notice("%s %s failed '%s' (%d): re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->action, + services_ocf_exitcode_str(cmd->result.exit_status), + cmd->result.exit_status, time_sum, timeout_left, + delay); + } + + cmd_reset(cmd); + if(rsc) { + rsc->active = NULL; + } + schedule_lrmd_cmd(rsc, cmd); + + /* Don't finalize cmd, we're not done with it yet */ + return; + + } else { + crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)", + cmd->rsc_id, + (cmd->real_action? cmd->real_action : cmd->action), + cmd->result.exit_status, time_sum, timeout_left); + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_TIMEOUT, + "Investigate reason for timeout, and adjust " + "configured operation timeout if necessary"); + cmd_original_times(cmd); + } + } +#endif + + pcmk__set_result_output(&(cmd->result), services__grab_stdout(action), + services__grab_stderr(action)); + cmd_finalize(cmd, rsc); +} + +/*! 
+ * \internal + * \brief Process the result of a fence device action (start, stop, or monitor) + * + * \param[in,out] cmd Fence device action that completed + * \param[in] exit_status Fencer API exit status for action + * \param[in] execution_status Fencer API execution status for action + * \param[in] exit_reason Human-friendly detail, if action failed + */ +static void +stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + enum pcmk_exec_status execution_status, + const char *exit_reason) +{ + // This can be NULL if resource was removed before command completed + lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id); + + // Simplify fencer exit status to uniform exit status + if (exit_status != CRM_EX_OK) { + exit_status = PCMK_OCF_UNKNOWN_ERROR; + } + + if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) { + /* An in-flight fence action was cancelled. The execution status is + * already correct, so don't overwrite it. + */ + execution_status = PCMK_EXEC_CANCELLED; + + } else { + /* Some execution status codes have specific meanings for the fencer + * that executor clients may not expect, so map them to a simple error + * status. + */ + switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: + execution_status = PCMK_EXEC_ERROR; + break; + + case PCMK_EXEC_NO_FENCE_DEVICE: + /* This should be possible only for probes in practice, but + * interpret for all actions to be safe. 
+ */ + if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS, + pcmk__str_none)) { + exit_status = PCMK_OCF_NOT_RUNNING; + + } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP, + pcmk__str_none)) { + exit_status = PCMK_OCF_OK; + + } else { + exit_status = PCMK_OCF_NOT_INSTALLED; + } + execution_status = PCMK_EXEC_ERROR; + break; + + case PCMK_EXEC_NOT_SUPPORTED: + exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE; + break; + + default: + break; + } + } + + pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason); + + // Certain successful actions change the known state of the resource + if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { + + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_OK, + PCMK_EXEC_DONE, NULL); // "running" + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_ERROR, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); // "not running" + } + } + + /* The recurring timer should not be running at this point in any case, but + * as a failsafe, stop it if it is. + */ + stop_recurring_timer(cmd); + + /* Reschedule this command if appropriate. If a recurring command is *not* + * rescheduled, its status must be PCMK_EXEC_CANCELLED, otherwise it will + * not be removed from recurring_ops by cmd_finalize(). 
+ */ + if (rsc && (cmd->interval_ms > 0) + && (cmd->result.execution_status != PCMK_EXEC_CANCELLED)) { + start_recurring_timer(cmd); + } + + cmd_finalize(cmd, rsc); +} + +static void +lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) +{ + if ((data == NULL) || (data->userdata == NULL)) { + crm_err("Ignoring fence action result: " + "Invalid callback arguments (bug?)"); + } else { + stonith_action_complete((lrmd_cmd_t *) data->userdata, + stonith__exit_status(data), + stonith__execution_status(data), + stonith__exit_reason(data)); + } +} + +void +stonith_connection_failed(void) +{ + GHashTableIter iter; + lrmd_rsc_t *rsc = NULL; + + crm_warn("Connection to fencer lost (any pending operations for " + "fence devices will be considered failed)"); + + g_hash_table_iter_init(&iter, rsc_list); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &rsc)) { + if (!pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_none)) { + continue; + } + + /* If we registered this fence device, we don't know whether the + * fencer still has the registration or not. Cause future probes to + * return an error until the resource is stopped or started + * successfully. This is especially important if the controller also + * went away (possibly due to a cluster layer restart) and won't + * receive our client notification of any monitors finalized below. + */ + if (rsc->fence_probe_result.execution_status == PCMK_EXEC_DONE) { + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_ERROR, + PCMK_EXEC_NOT_CONNECTED, + "Lost connection to fencer"); + } + + // Consider any active, pending, or recurring operations as failed + + for (GList *op = rsc->recurring_ops; op != NULL; op = op->next) { + lrmd_cmd_t *cmd = op->data; + + /* This won't free a recurring op but instead restart its timer. + * If cmd is rsc->active, this will set rsc->active to NULL, so we + * don't have to worry about finalizing it a second time below. 
+ */ + stonith_action_complete(cmd, + CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, + "Lost connection to fencer"); + } + + if (rsc->active != NULL) { + rsc->pending_ops = g_list_prepend(rsc->pending_ops, rsc->active); + } + while (rsc->pending_ops != NULL) { + // This will free the op and remove it from rsc->pending_ops + stonith_action_complete((lrmd_cmd_t *) rsc->pending_ops->data, + CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, + "Lost connection to fencer"); + } + } +} + +/*! + * \internal + * \brief Execute a stonith resource "start" action + * + * Start a stonith resource by registering it with the fencer. + * (Stonith agents don't have a start command.) + * + * \param[in,out] stonith_api Connection to fencer + * \param[in] rsc Stonith resource to start + * \param[in] cmd Start command to execute + * + * \return pcmk_ok on success, -errno otherwise + */ +static int +execd_stonith_start(stonith_t *stonith_api, const lrmd_rsc_t *rsc, + const lrmd_cmd_t *cmd) +{ + char *key = NULL; + char *value = NULL; + stonith_key_value_t *device_params = NULL; + int rc = pcmk_ok; + + // Convert command parameters to stonith API key/values + if (cmd->params) { + GHashTableIter iter; + + g_hash_table_iter_init(&iter, cmd->params); + while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { + device_params = stonith_key_value_add(device_params, key, value); + } + } + + /* The fencer will automatically register devices via CIB notifications + * when the CIB changes, but to avoid a possible race condition between + * the fencer receiving the notification and the executor requesting that + * resource, the executor registers the device as well. The fencer knows how + * to handle duplicate registrations. + */ + rc = stonith_api->cmds->register_device(stonith_api, st_opt_sync_call, + cmd->rsc_id, rsc->provider, + rsc->type, device_params); + + stonith_key_value_freeall(device_params, 1, 1); + return rc; +} + +/*! 
+ * \internal + * \brief Execute a stonith resource "stop" action + * + * Stop a stonith resource by unregistering it with the fencer. + * (Stonith agents don't have a stop command.) + * + * \param[in,out] stonith_api Connection to fencer + * \param[in] rsc Stonith resource to stop + * + * \return pcmk_ok on success, -errno otherwise + */ +static inline int +execd_stonith_stop(stonith_t *stonith_api, const lrmd_rsc_t *rsc) +{ + /* @TODO Failure would indicate a problem communicating with fencer; + * perhaps we should try reconnecting and retrying a few times? + */ + return stonith_api->cmds->remove_device(stonith_api, st_opt_sync_call, + rsc->rsc_id); +} + +/*! + * \internal + * \brief Initiate a stonith resource agent recurring "monitor" action + * + * \param[in,out] stonith_api Connection to fencer + * \param[in,out] rsc Stonith resource to monitor + * \param[in] cmd Monitor command being executed + * + * \return pcmk_ok if monitor was successfully initiated, -errno otherwise + */ +static inline int +execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) +{ + int rc = stonith_api->cmds->monitor(stonith_api, 0, cmd->rsc_id, + cmd->timeout / 1000); + + rc = stonith_api->cmds->register_callback(stonith_api, rc, 0, 0, cmd, + "lrmd_stonith_callback", + lrmd_stonith_callback); + if (rc == TRUE) { + rsc->active = cmd; + rc = pcmk_ok; + } else { + rc = -pcmk_err_generic; + } + return rc; +} + +static void +execute_stonith_action(lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) +{ + int rc = 0; + bool do_monitor = FALSE; + + stonith_t *stonith_api = get_stonith_connection(); + + if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei) + && (cmd->interval_ms == 0)) { + // Probes don't require a fencer connection + stonith_action_complete(cmd, rsc->fence_probe_result.exit_status, + rsc->fence_probe_result.execution_status, + rsc->fence_probe_result.exit_reason); + return; + + } else if (stonith_api == NULL) { + stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR, 
+ PCMK_EXEC_NOT_CONNECTED, + "No connection to fencer"); + return; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); + if (rc == pcmk_ok) { + do_monitor = TRUE; + } + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + rc = execd_stonith_stop(stonith_api, rsc); + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + do_monitor = TRUE; + + } else { + stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE, + PCMK_EXEC_ERROR, + "Invalid fence device action (bug?)"); + return; + } + + if (do_monitor) { + rc = execd_stonith_monitor(stonith_api, rsc, cmd); + if (rc == pcmk_ok) { + // Don't clean up yet, we will find out result of the monitor later + return; + } + } + + stonith_action_complete(cmd, + ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), + stonith__legacy2status(rc), + ((rc == -pcmk_err_generic)? NULL : pcmk_strerror(rc))); +} + +static void +execute_nonstonith_action(lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) +{ + svc_action_t *action = NULL; + GHashTable *params_copy = NULL; + + CRM_ASSERT(rsc); + CRM_ASSERT(cmd); + + crm_trace("Creating action, resource:%s action:%s class:%s provider:%s agent:%s", + rsc->rsc_id, cmd->action, rsc->class, rsc->provider, rsc->type); + +#if SUPPORT_NAGIOS + /* Recurring operations are cancelled anyway for a stop operation */ + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei) + && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + + cmd->result.exit_status = PCMK_OCF_OK; + cmd_finalize(cmd, rsc); + return; + } +#endif + + params_copy = pcmk__str_table_dup(cmd->params); + + action = services__create_resource_action(rsc->rsc_id, rsc->class, rsc->provider, + rsc->type, + normalize_action_name(rsc, cmd->action), + cmd->interval_ms, cmd->timeout, + params_copy, cmd->service_flags); + + if (action == NULL) { + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_ERROR, strerror(ENOMEM)); + 
cmd_finalize(cmd, rsc); + return; + } + + if (action->rc != PCMK_OCF_UNKNOWN) { + pcmk__set_result(&(cmd->result), action->rc, action->status, + services__exit_reason(action)); + services_action_free(action); + cmd_finalize(cmd, rsc); + return; + } + + action->cb_data = cmd; + + if (services_action_async(action, action_complete)) { + /* The services library has taken responsibility for the action. It + * could be pending, blocked, or merged into a duplicate recurring + * action, in which case the action callback (action_complete()) + * will be called when the action completes, otherwise the callback has + * already been called. + * + * action_complete() calls cmd_finalize() which can free cmd, so cmd + * cannot be used here. + */ + } else { + /* This is a recurring action that is not being cancelled and could not + * be initiated. It has been rescheduled, and the action callback + * (action_complete()) has been called, which in this case has already + * called cmd_finalize(), which in this case should only reset (not + * free) cmd. 
+ */ + + pcmk__set_result(&(cmd->result), action->rc, action->status, + services__exit_reason(action)); + services_action_free(action); + } +} + +static gboolean +execute_resource_action(gpointer user_data) +{ + lrmd_rsc_t *rsc = (lrmd_rsc_t *) user_data; + lrmd_cmd_t *cmd = NULL; + + CRM_CHECK(rsc != NULL, return FALSE); + + if (rsc->active) { + crm_trace("%s is still active", rsc->rsc_id); + return TRUE; + } + + if (rsc->pending_ops) { + GList *first = rsc->pending_ops; + + cmd = first->data; + if (cmd->delay_id) { + crm_trace + ("Command %s %s was asked to run too early, waiting for start_delay timeout of %dms", + cmd->rsc_id, cmd->action, cmd->start_delay); + return TRUE; + } + rsc->pending_ops = g_list_remove_link(rsc->pending_ops, first); + g_list_free_1(first); + +#ifdef PCMK__TIME_USE_CGT + get_current_time(&(cmd->t_run), &(cmd->t_first_run)); +#endif + cmd->epoch_last_run = time(NULL); + } + + if (!cmd) { + crm_trace("Nothing further to do for %s", rsc->rsc_id); + return TRUE; + } + + rsc->active = cmd; /* only one op at a time for a rsc */ + if (cmd->interval_ms) { + rsc->recurring_ops = g_list_append(rsc->recurring_ops, cmd); + } + + log_execute(cmd); + + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { + execute_stonith_action(rsc, cmd); + } else { + execute_nonstonith_action(rsc, cmd); + } + + return TRUE; +} + +void +free_rsc(gpointer data) +{ + GList *gIter = NULL; + lrmd_rsc_t *rsc = data; + int is_stonith = pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_casei); + + gIter = rsc->pending_ops; + while (gIter != NULL) { + GList *next = gIter->next; + lrmd_cmd_t *cmd = gIter->data; + + /* command was never executed */ + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + cmd_finalize(cmd, NULL); + + gIter = next; + } + /* frees list, but not list elements. 
*/ + g_list_free(rsc->pending_ops); + + gIter = rsc->recurring_ops; + while (gIter != NULL) { + GList *next = gIter->next; + lrmd_cmd_t *cmd = gIter->data; + + if (is_stonith) { + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + /* If a stonith command is in-flight, just mark it as cancelled; + * it is not safe to finalize/free the cmd until the stonith api + * says it has either completed or timed out. + */ + if (rsc->active != cmd) { + cmd_finalize(cmd, NULL); + } + } else { + /* This command is already handed off to service library, + * let service library cancel it and tell us via the callback + * when it is cancelled. The rsc can be safely destroyed + * even if we are waiting for the cancel result */ + services_action_cancel(rsc->rsc_id, + normalize_action_name(rsc, cmd->action), + cmd->interval_ms); + } + + gIter = next; + } + /* frees list, but not list elements. */ + g_list_free(rsc->recurring_ops); + + free(rsc->rsc_id); + free(rsc->class); + free(rsc->provider); + free(rsc->type); + mainloop_destroy_trigger(rsc->work); + + free(rsc); +} + +static int +process_lrmd_signon(pcmk__client_t *client, xmlNode *request, int call_id, + xmlNode **reply) +{ + int rc = pcmk_ok; + time_t now = time(NULL); + const char *protocol_version = crm_element_value(request, F_LRMD_PROTOCOL_VERSION); + + if (compare_version(protocol_version, LRMD_MIN_PROTOCOL_VERSION) < 0) { + crm_err("Cluster API version must be greater than or equal to %s, not %s", + LRMD_MIN_PROTOCOL_VERSION, protocol_version); + rc = -EPROTO; + } + + if (pcmk__xe_attr_is_true(request, F_LRMD_IS_IPC_PROVIDER)) { +#ifdef PCMK__COMPILE_REMOTE + if ((client->remote != NULL) + && pcmk_is_set(client->flags, + pcmk__client_tls_handshake_complete)) { + + // This is a remote connection from a cluster node's controller + ipc_proxy_add_provider(client); + } else { + rc = -EACCES; + } +#else + rc = -EPROTONOSUPPORT; +#endif + } + + *reply = create_lrmd_reply(__func__, rc, call_id); + crm_xml_add(*reply, 
F_LRMD_OPERATION, CRM_OP_REGISTER); + crm_xml_add(*reply, F_LRMD_CLIENTID, client->id); + crm_xml_add(*reply, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION); + crm_xml_add_ll(*reply, PCMK__XA_UPTIME, now - start_time); + + return rc; +} + +static int +process_lrmd_rsc_register(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + int rc = pcmk_ok; + lrmd_rsc_t *rsc = build_rsc_from_xml(request); + lrmd_rsc_t *dup = g_hash_table_lookup(rsc_list, rsc->rsc_id); + + if (dup && + pcmk__str_eq(rsc->class, dup->class, pcmk__str_casei) && + pcmk__str_eq(rsc->provider, dup->provider, pcmk__str_casei) && pcmk__str_eq(rsc->type, dup->type, pcmk__str_casei)) { + + crm_notice("Ignoring duplicate registration of '%s'", rsc->rsc_id); + free_rsc(rsc); + return rc; + } + + g_hash_table_replace(rsc_list, rsc->rsc_id, rsc); + crm_info("Cached agent information for '%s'", rsc->rsc_id); + return rc; +} + +static xmlNode * +process_lrmd_get_rsc_info(xmlNode *request, int call_id) +{ + int rc = pcmk_ok; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + xmlNode *reply = NULL; + lrmd_rsc_t *rsc = NULL; + + if (rsc_id == NULL) { + rc = -ENODEV; + } else { + rsc = g_hash_table_lookup(rsc_list, rsc_id); + if (rsc == NULL) { + crm_info("Agent information for '%s' not in cache", rsc_id); + rc = -ENODEV; + } + } + + reply = create_lrmd_reply(__func__, rc, call_id); + if (rsc) { + crm_xml_add(reply, F_LRMD_RSC_ID, rsc->rsc_id); + crm_xml_add(reply, F_LRMD_CLASS, rsc->class); + crm_xml_add(reply, F_LRMD_PROVIDER, rsc->provider); + crm_xml_add(reply, F_LRMD_TYPE, rsc->type); + } + return reply; +} + +static int +process_lrmd_rsc_unregister(pcmk__client_t *client, uint32_t id, + xmlNode *request) +{ + int rc = pcmk_ok; + lrmd_rsc_t *rsc = NULL; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + + if (!rsc_id) { + 
return -ENODEV; + } + + rsc = g_hash_table_lookup(rsc_list, rsc_id); + if (rsc == NULL) { + crm_info("Ignoring unregistration of resource '%s', which is not registered", + rsc_id); + return pcmk_ok; + } + + if (rsc->active) { + /* let the caller know there are still active ops on this rsc to watch for */ + crm_trace("Operation (%p) still in progress for unregistered resource %s", + rsc->active, rsc_id); + rc = -EINPROGRESS; + } + + g_hash_table_remove(rsc_list, rsc_id); + + return rc; +} + +static int +process_lrmd_rsc_exec(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + lrmd_rsc_t *rsc = NULL; + lrmd_cmd_t *cmd = NULL; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + int call_id; + + if (!rsc_id) { + return -EINVAL; + } + if (!(rsc = g_hash_table_lookup(rsc_list, rsc_id))) { + crm_info("Resource '%s' not found (%d active resources)", + rsc_id, g_hash_table_size(rsc_list)); + return -ENODEV; + } + + cmd = create_lrmd_cmd(request, client); + call_id = cmd->call_id; + + /* Don't reference cmd after handing it off to be scheduled. + * The cmd could get merged and freed. */ + schedule_lrmd_cmd(rsc, cmd); + + return call_id; +} + +static int +cancel_op(const char *rsc_id, const char *action, guint interval_ms) +{ + GList *gIter = NULL; + lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, rsc_id); + + /* How to cancel an action. + * 1. Check pending ops list, if it hasn't been handed off + * to the service library or stonith recurring list remove + * it there and that will stop it. + * 2. If it isn't in the pending ops list, then it's either a + * recurring op in the stonith recurring list, or the service + * library's recurring list. Stop it there + * 3. If not found in any lists, then this operation has either + * been executed already and is not a recurring operation, or + * never existed. 
+ */ + if (!rsc) { + return -ENODEV; + } + + for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) { + lrmd_cmd_t *cmd = gIter->data; + + if (action_matches(cmd, action, interval_ms)) { + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + cmd_finalize(cmd, rsc); + return pcmk_ok; + } + } + + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { + /* The service library does not handle stonith operations. + * We have to handle recurring stonith operations ourselves. */ + for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) { + lrmd_cmd_t *cmd = gIter->data; + + if (action_matches(cmd, action, interval_ms)) { + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + if (rsc->active != cmd) { + cmd_finalize(cmd, rsc); + } + return pcmk_ok; + } + } + } else if (services_action_cancel(rsc_id, + normalize_action_name(rsc, action), + interval_ms) == TRUE) { + /* The service library will tell the action_complete callback function + * this action was cancelled, which will destroy the cmd and remove + * it from the recurring_op list. Do not do that in this function + * if the service library says it cancelled it. */ + return pcmk_ok; + } + + return -EOPNOTSUPP; +} + +static void +cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id) +{ + GList *cmd_list = NULL; + GList *cmd_iter = NULL; + + /* Notice a copy of each list is created when concat is called. + * This prevents odd behavior from occurring when the cmd_list + * is iterated through later on. It is possible the cancel_op + * function may end up modifying the recurring_ops and pending_ops + * lists. 
If we did not copy those lists, our cmd_list iteration + * could get messed up.*/ + if (rsc->recurring_ops) { + cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->recurring_ops)); + } + if (rsc->pending_ops) { + cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->pending_ops)); + } + if (!cmd_list) { + return; + } + + for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) { + lrmd_cmd_t *cmd = cmd_iter->data; + + if (cmd->interval_ms == 0) { + continue; + } + + if (client_id && !pcmk__str_eq(cmd->client_id, client_id, pcmk__str_casei)) { + continue; + } + + cancel_op(rsc->rsc_id, cmd->action, cmd->interval_ms); + } + /* frees only the copied list data, not the cmds */ + g_list_free(cmd_list); +} + +static int +process_lrmd_rsc_cancel(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + const char *action = crm_element_value(rsc_xml, F_LRMD_RSC_ACTION); + guint interval_ms = 0; + + crm_element_value_ms(rsc_xml, F_LRMD_RSC_INTERVAL, &interval_ms); + + if (!rsc_id || !action) { + return -EINVAL; + } + + return cancel_op(rsc_id, action, interval_ms); +} + +static void +add_recurring_op_xml(xmlNode *reply, lrmd_rsc_t *rsc) +{ + xmlNode *rsc_xml = create_xml_node(reply, F_LRMD_RSC); + + crm_xml_add(rsc_xml, F_LRMD_RSC_ID, rsc->rsc_id); + for (GList *item = rsc->recurring_ops; item != NULL; item = item->next) { + lrmd_cmd_t *cmd = item->data; + xmlNode *op_xml = create_xml_node(rsc_xml, T_LRMD_RSC_OP); + + crm_xml_add(op_xml, F_LRMD_RSC_ACTION, + (cmd->real_action? 
cmd->real_action : cmd->action)); + crm_xml_add_ms(op_xml, F_LRMD_RSC_INTERVAL, cmd->interval_ms); + crm_xml_add_int(op_xml, F_LRMD_TIMEOUT, cmd->timeout_orig); + } +} + +static xmlNode * +process_lrmd_get_recurring(xmlNode *request, int call_id) +{ + int rc = pcmk_ok; + const char *rsc_id = NULL; + lrmd_rsc_t *rsc = NULL; + xmlNode *reply = NULL; + xmlNode *rsc_xml = NULL; + + // Resource ID is optional + rsc_xml = first_named_child(request, F_LRMD_CALLDATA); + if (rsc_xml) { + rsc_xml = first_named_child(rsc_xml, F_LRMD_RSC); + } + if (rsc_xml) { + rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + } + + // If resource ID is specified, resource must exist + if (rsc_id != NULL) { + rsc = g_hash_table_lookup(rsc_list, rsc_id); + if (rsc == NULL) { + crm_info("Resource '%s' not found (%d active resources)", + rsc_id, g_hash_table_size(rsc_list)); + rc = -ENODEV; + } + } + + reply = create_lrmd_reply(__func__, rc, call_id); + + // If resource ID is not specified, check all resources + if (rsc_id == NULL) { + GHashTableIter iter; + char *key = NULL; + + g_hash_table_iter_init(&iter, rsc_list); + while (g_hash_table_iter_next(&iter, (gpointer *) &key, + (gpointer *) &rsc)) { + add_recurring_op_xml(reply, rsc); + } + } else if (rsc) { + add_recurring_op_xml(reply, rsc); + } + return reply; +} + +void +process_lrmd_message(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + int rc = pcmk_ok; + int call_id = 0; + const char *op = crm_element_value(request, F_LRMD_OPERATION); + int do_reply = 0; + int do_notify = 0; + xmlNode *reply = NULL; + + /* Certain IPC commands may be done only by privileged users (i.e. root or + * hacluster), because they would otherwise provide a means of bypassing + * ACLs. 
+ */ + bool allowed = pcmk_is_set(client->flags, pcmk__client_privileged); + + crm_trace("Processing %s operation from %s", op, client->id); + crm_element_value_int(request, F_LRMD_CALLID, &call_id); + + if (pcmk__str_eq(op, CRM_OP_IPC_FWD, pcmk__str_none)) { +#ifdef PCMK__COMPILE_REMOTE + if (allowed) { + ipc_proxy_forward_client(client, request); + } else { + rc = -EACCES; + } +#else + rc = -EPROTONOSUPPORT; +#endif + do_reply = 1; + } else if (pcmk__str_eq(op, CRM_OP_REGISTER, pcmk__str_none)) { + rc = process_lrmd_signon(client, request, call_id, &reply); + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_REG, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_register(client, id, request); + do_notify = 1; + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_INFO, pcmk__str_none)) { + if (allowed) { + reply = process_lrmd_get_rsc_info(request, call_id); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_UNREG, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_unregister(client, id, request); + /* don't notify anyone about failed un-registers */ + if (rc == pcmk_ok || rc == -EINPROGRESS) { + do_notify = 1; + } + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_EXEC, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_exec(client, id, request); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_CANCEL, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_cancel(client, id, request); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_POKE, pcmk__str_none)) { + do_notify = 1; + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_CHECK, pcmk__str_none)) { + if (allowed) { + xmlNode *data = get_message_xml(request, F_LRMD_CALLDATA); + + CRM_LOG_ASSERT(data != NULL); + pcmk__valid_sbd_timeout(crm_element_value(data, F_LRMD_WATCHDOG)); + } else { 
+ rc = -EACCES; + } + } else if (pcmk__str_eq(op, LRMD_OP_ALERT_EXEC, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_alert_exec(client, id, request); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_GET_RECURRING, pcmk__str_none)) { + if (allowed) { + reply = process_lrmd_get_recurring(request, call_id); + } else { + rc = -EACCES; + } + do_reply = 1; + } else { + rc = -EOPNOTSUPP; + do_reply = 1; + crm_err("Unknown IPC request '%s' from client %s", + op, pcmk__client_name(client)); + } + + if (rc == -EACCES) { + crm_warn("Rejecting IPC request '%s' from unprivileged client %s", + op, pcmk__client_name(client)); + } + + crm_debug("Processed %s operation from %s: rc=%d, reply=%d, notify=%d", + op, client->id, rc, do_reply, do_notify); + + if (do_reply) { + int send_rc = pcmk_rc_ok; + + if (reply == NULL) { + reply = create_lrmd_reply(__func__, rc, call_id); + } + send_rc = lrmd_server_send_reply(client, id, reply); + free_xml(reply); + if (send_rc != pcmk_rc_ok) { + crm_warn("Reply to client %s failed: %s " CRM_XS " rc=%d", + pcmk__client_name(client), pcmk_rc_str(send_rc), send_rc); + } + } + + if (do_notify) { + send_generic_notify(rc, request); + } +} |