diff options
Diffstat (limited to '')
-rw-r--r-- | lib/pengine/failcounts.c | 247 |
1 files changed, 156 insertions, 91 deletions
diff --git a/lib/pengine/failcounts.c b/lib/pengine/failcounts.c index a4a3e11..6990d3d 100644 --- a/lib/pengine/failcounts.c +++ b/lib/pengine/failcounts.c @@ -77,7 +77,8 @@ is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml, } static gboolean -block_failure(const pe_node_t *node, pe_resource_t *rsc, const xmlNode *xml_op) +block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc, + const xmlNode *xml_op) { char *xml_name = clone_strip(rsc->id); @@ -180,11 +181,11 @@ block_failure(const pe_node_t *node, pe_resource_t *rsc, const xmlNode *xml_op) * \note The caller is responsible for freeing the result. */ static inline char * -rsc_fail_name(const pe_resource_t *rsc) +rsc_fail_name(const pcmk_resource_t *rsc) { const char *name = (rsc->clone_name? rsc->clone_name : rsc->id); - return pcmk_is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name); + return pcmk_is_set(rsc->flags, pcmk_rsc_unique)? strdup(name) : clone_strip(name); } /*! @@ -236,7 +237,6 @@ generate_fail_regex(const char *prefix, const char *rsc_name, * \brief Compile regular expressions to match failure-related node attributes * * \param[in] rsc Resource being checked for failures - * \param[in] data_set Data set (for CRM feature set version) * \param[out] failcount_re Storage for regular expression for fail count * \param[out] lastfailure_re Storage for regular expression for last failure * @@ -245,23 +245,25 @@ generate_fail_regex(const char *prefix, const char *rsc_name, * regfree(). */ static int -generate_fail_regexes(const pe_resource_t *rsc, - const pe_working_set_t *data_set, +generate_fail_regexes(const pcmk_resource_t *rsc, regex_t *failcount_re, regex_t *lastfailure_re) { + int rc = pcmk_rc_ok; char *rsc_name = rsc_fail_name(rsc); - const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION); + const char *version = crm_element_value(rsc->cluster->input, + XML_ATTR_CRM_VERSION); + + // @COMPAT Pacemaker <= 1.1.16 used a single fail count per resource gboolean is_legacy = (compare_version(version, "3.0.13") < 0); - int rc = pcmk_rc_ok; if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy, - pcmk_is_set(rsc->flags, pe_rsc_unique), + pcmk_is_set(rsc->flags, pcmk_rsc_unique), failcount_re) != pcmk_rc_ok) { rc = EINVAL; } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name, is_legacy, - pcmk_is_set(rsc->flags, pe_rsc_unique), + pcmk_is_set(rsc->flags, pcmk_rsc_unique), lastfailure_re) != pcmk_rc_ok) { rc = EINVAL; regfree(failcount_re); @@ -271,68 +273,137 @@ generate_fail_regexes(const pe_resource_t *rsc, return rc; } -int -pe_get_failcount(const pe_node_t *node, pe_resource_t *rsc, - time_t *last_failure, uint32_t flags, const xmlNode *xml_op) +// Data for fail-count-related iterators +struct failcount_data { + const pcmk_node_t *node;// Node to check for fail count + pcmk_resource_t *rsc; // Resource to check for fail count + uint32_t flags; // Fail count flags + const xmlNode *xml_op; // History entry for expiration purposes (or NULL) + regex_t failcount_re; // Fail count regular expression to match + regex_t lastfailure_re; // Last failure regular expression to match + int failcount; // Fail count so far + time_t last_failure; // Time of most recent failure so far +}; + +/*! + * \internal + * \brief Update fail count and last failure appropriately for a node attribute + * + * \param[in] key Node attribute name + * \param[in] value Node attribute value + * \param[in] user_data Fail count data to update + */ +static void +update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data) { - char *key = NULL; - const char *value = NULL; - regex_t failcount_re, lastfailure_re; - int failcount = 0; - time_t last = 0; - GHashTableIter iter; - - CRM_CHECK(generate_fail_regexes(rsc, rsc->cluster, &failcount_re, - &lastfailure_re) == pcmk_rc_ok, - return 0); + struct failcount_data *fc_data = user_data; + + // If this is a matching fail count attribute, update fail count + if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) { + fc_data->failcount = pcmk__add_scores(fc_data->failcount, + char2score(value)); + pe_rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)", + (const char *) key, (const char *) value, fc_data->rsc->id, + pcmk_readable_score(fc_data->failcount)); + return; + } - /* Resource fail count is sum of all matching operation fail counts */ - g_hash_table_iter_init(&iter, node->details->attrs); - while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) { - if (regexec(&failcount_re, key, 0, NULL, 0) == 0) { - failcount = pcmk__add_scores(failcount, char2score(value)); - crm_trace("Added %s (%s) to %s fail count (now %s)", - key, value, rsc->id, pcmk_readable_score(failcount)); - } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) { - long long last_ll; - - if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) { - last = (time_t) QB_MAX(last, last_ll); - } + // If this is a matching last failure attribute, update last failure + if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL, + 0) == 0) { + long long last_ll; + + if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) { + fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure, + last_ll); } } +} - regfree(&failcount_re); - regfree(&lastfailure_re); +/*! + * \internal + * \brief Update fail count and last failure appropriately for a filler resource + * + * \param[in] data Filler resource + * \param[in] user_data Fail count data to update + */ +static void +update_failcount_for_filler(gpointer data, gpointer user_data) +{ + pcmk_resource_t *filler = data; + struct failcount_data *fc_data = user_data; + time_t filler_last_failure = 0; + + fc_data->failcount += pe_get_failcount(fc_data->node, filler, + &filler_last_failure, fc_data->flags, + fc_data->xml_op); + fc_data->last_failure = QB_MAX(fc_data->last_failure, filler_last_failure); +} - if ((failcount > 0) && (last > 0) && (last_failure != NULL)) { - *last_failure = last; - } +/*! + * \internal + * \brief Get a resource's fail count on a node + * + * \param[in] node Node to check + * \param[in,out] rsc Resource to check + * \param[out] last_failure If not NULL, where to set time of most recent + * failure of \p rsc on \p node + * \param[in] flags Group of enum pcmk__fc_flags + * \param[in] xml_op If not NULL, consider only the action in this + * history entry when determining whether on-fail + * is configured as "blocked", otherwise consider + * all actions configured for \p rsc + * + * \return Fail count for \p rsc on \p node according to \p flags + */ +int +pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc, + time_t *last_failure, uint32_t flags, const xmlNode *xml_op) +{ + struct failcount_data fc_data = { + .node = node, + .rsc = rsc, + .flags = flags, + .xml_op = xml_op, + .failcount = 0, + .last_failure = (time_t) 0, + }; + + // Calculate resource failcount as sum of all matching operation failcounts + CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re, + &fc_data.lastfailure_re) == pcmk_rc_ok, + return 0); + g_hash_table_foreach(node->details->attrs, update_failcount_for_attr, + &fc_data); + regfree(&(fc_data.failcount_re)); + regfree(&(fc_data.lastfailure_re)); - /* If failure blocks the resource, disregard any failure timeout */ - if ((failcount > 0) && rsc->failure_timeout + // If failure blocks the resource, disregard any failure timeout + if ((fc_data.failcount > 0) && (rsc->failure_timeout > 0) && block_failure(node, rsc, xml_op)) { - pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block", + pe_warn("Ignoring failure timeout %d for %s " + "because it conflicts with on-fail=block", rsc->failure_timeout, rsc->id); rsc->failure_timeout = 0; } - /* If all failures have expired, ignore fail count */ - if (pcmk_is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0) - && rsc->failure_timeout) { + // If all failures have expired, ignore fail count + if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0) + && (fc_data.last_failure > 0) && (rsc->failure_timeout != 0)) { time_t now = get_effective_time(rsc->cluster); - if (now > (last + rsc->failure_timeout)) { - crm_debug("Failcount for %s on %s expired after %ds", - rsc->id, pe__node_name(node), rsc->failure_timeout); - failcount = 0; + if (now > (fc_data.last_failure + rsc->failure_timeout)) { + pe_rsc_debug(rsc, "Failcount for %s on %s expired after %ds", + rsc->id, pe__node_name(node), rsc->failure_timeout); + fc_data.failcount = 0; } } - /* We never want the fail counts of a bundle container's fillers to - * count towards the container's fail count. + /* Add the fail count of any filler resources, except that we never want the + * fail counts of a bundle container's fillers to count towards the + * container's fail count. * * Most importantly, a Pacemaker Remote connection to a bundle container * is a filler of the container, but can reside on a different node than the @@ -340,62 +411,56 @@ pe_get_failcount(const pe_node_t *node, pe_resource_t *rsc, * container's fail count on that node could lead to attempting to stop the * container on the wrong node. */ - - if (pcmk_is_set(flags, pe_fc_fillers) && rsc->fillers + if (pcmk_is_set(flags, pcmk__fc_fillers) && (rsc->fillers != NULL) && !pe_rsc_is_bundled(rsc)) { - GList *gIter = NULL; - - for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) { - pe_resource_t *filler = (pe_resource_t *) gIter->data; - time_t filler_last_failure = 0; - - failcount += pe_get_failcount(node, filler, &filler_last_failure, - flags, xml_op); - - if (last_failure && filler_last_failure > *last_failure) { - *last_failure = filler_last_failure; - } - } - - if (failcount > 0) { - crm_info("Container %s and the resources within it " - "have failed %s time%s on %s", - rsc->id, pcmk_readable_score(failcount), - pcmk__plural_s(failcount), pe__node_name(node)); + g_list_foreach(rsc->fillers, update_failcount_for_filler, &fc_data); + if (fc_data.failcount > 0) { + pe_rsc_info(rsc, + "Container %s and the resources within it " + "have failed %s time%s on %s", + rsc->id, pcmk_readable_score(fc_data.failcount), + pcmk__plural_s(fc_data.failcount), pe__node_name(node)); } - } else if (failcount > 0) { - crm_info("%s has failed %s time%s on %s", - rsc->id, pcmk_readable_score(failcount), - pcmk__plural_s(failcount), pe__node_name(node)); + } else if (fc_data.failcount > 0) { + pe_rsc_info(rsc, "%s has failed %s time%s on %s", + rsc->id, pcmk_readable_score(fc_data.failcount), + pcmk__plural_s(fc_data.failcount), pe__node_name(node)); } - return failcount; + if (last_failure != NULL) { + if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) { + *last_failure = fc_data.last_failure; + } else { + *last_failure = 0; + } + } + return fc_data.failcount; } /*! * \brief Schedule a controller operation to clear a fail count * - * \param[in,out] rsc Resource with failure - * \param[in] node Node failure occurred on - * \param[in] reason Readable description why needed (for logging) - * \param[in,out] data_set Working set for cluster + * \param[in,out] rsc Resource with failure + * \param[in] node Node failure occurred on + * \param[in] reason Readable description why needed (for logging) + * \param[in,out] scheduler Scheduler data cluster * * \return Scheduled action */ -pe_action_t * -pe__clear_failcount(pe_resource_t *rsc, const pe_node_t *node, - const char *reason, pe_working_set_t *data_set) +pcmk_action_t * +pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node, + const char *reason, pcmk_scheduler_t *scheduler) { char *key = NULL; - pe_action_t *clear = NULL; + pcmk_action_t *clear = NULL; - CRM_CHECK(rsc && node && reason && data_set, return NULL); + CRM_CHECK(rsc && node && reason && scheduler, return NULL); - key = pcmk__op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0); - clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, - data_set); + key = pcmk__op_key(rsc->id, PCMK_ACTION_CLEAR_FAILCOUNT, 0); + clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE, + scheduler); add_hash_param(clear->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s", rsc->id, pe__node_name(node), reason, clear->uuid); |