summary | refs | log | tree | commit | diff | stats
path: root/lib/pengine/failcounts.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- lib/pengine/failcounts.c 247
1 files changed, 156 insertions, 91 deletions
diff --git a/lib/pengine/failcounts.c b/lib/pengine/failcounts.c
index a4a3e11..6990d3d 100644
--- a/lib/pengine/failcounts.c
+++ b/lib/pengine/failcounts.c
@@ -77,7 +77,8 @@ is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
}
static gboolean
-block_failure(const pe_node_t *node, pe_resource_t *rsc, const xmlNode *xml_op)
+block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
+ const xmlNode *xml_op)
{
char *xml_name = clone_strip(rsc->id);
@@ -180,11 +181,11 @@ block_failure(const pe_node_t *node, pe_resource_t *rsc, const xmlNode *xml_op)
* \note The caller is responsible for freeing the result.
*/
static inline char *
-rsc_fail_name(const pe_resource_t *rsc)
+rsc_fail_name(const pcmk_resource_t *rsc)
{
const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
- return pcmk_is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
+ return pcmk_is_set(rsc->flags, pcmk_rsc_unique)? strdup(name) : clone_strip(name);
}
/*!
@@ -236,7 +237,6 @@ generate_fail_regex(const char *prefix, const char *rsc_name,
* \brief Compile regular expressions to match failure-related node attributes
*
* \param[in] rsc Resource being checked for failures
- * \param[in] data_set Data set (for CRM feature set version)
* \param[out] failcount_re Storage for regular expression for fail count
* \param[out] lastfailure_re Storage for regular expression for last failure
*
@@ -245,23 +245,25 @@ generate_fail_regex(const char *prefix, const char *rsc_name,
* regfree().
*/
static int
-generate_fail_regexes(const pe_resource_t *rsc,
- const pe_working_set_t *data_set,
+generate_fail_regexes(const pcmk_resource_t *rsc,
regex_t *failcount_re, regex_t *lastfailure_re)
{
+ int rc = pcmk_rc_ok;
char *rsc_name = rsc_fail_name(rsc);
- const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION);
+ const char *version = crm_element_value(rsc->cluster->input,
+ XML_ATTR_CRM_VERSION);
+
+ // @COMPAT Pacemaker <= 1.1.16 used a single fail count per resource
gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
- int rc = pcmk_rc_ok;
if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
- pcmk_is_set(rsc->flags, pe_rsc_unique),
+ pcmk_is_set(rsc->flags, pcmk_rsc_unique),
failcount_re) != pcmk_rc_ok) {
rc = EINVAL;
} else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
is_legacy,
- pcmk_is_set(rsc->flags, pe_rsc_unique),
+ pcmk_is_set(rsc->flags, pcmk_rsc_unique),
lastfailure_re) != pcmk_rc_ok) {
rc = EINVAL;
regfree(failcount_re);
@@ -271,68 +273,137 @@ generate_fail_regexes(const pe_resource_t *rsc,
return rc;
}
-int
-pe_get_failcount(const pe_node_t *node, pe_resource_t *rsc,
- time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
+// Data for fail-count-related iterators
+struct failcount_data {
+ const pcmk_node_t *node;// Node to check for fail count
+ pcmk_resource_t *rsc; // Resource to check for fail count
+ uint32_t flags; // Fail count flags
+ const xmlNode *xml_op; // History entry for expiration purposes (or NULL)
+ regex_t failcount_re; // Fail count regular expression to match
+ regex_t lastfailure_re; // Last failure regular expression to match
+ int failcount; // Fail count so far
+ time_t last_failure; // Time of most recent failure so far
+};
+
+/*!
+ * \internal
+ * \brief Update fail count and last failure appropriately for a node attribute
+ *
+ * \param[in] key Node attribute name
+ * \param[in] value Node attribute value
+ * \param[in,out] user_data Fail count data to update
+ */
+static void
+update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
{
- char *key = NULL;
- const char *value = NULL;
- regex_t failcount_re, lastfailure_re;
- int failcount = 0;
- time_t last = 0;
- GHashTableIter iter;
-
- CRM_CHECK(generate_fail_regexes(rsc, rsc->cluster, &failcount_re,
- &lastfailure_re) == pcmk_rc_ok,
- return 0);
+ struct failcount_data *fc_data = user_data;
+
+ // If this is a matching fail count attribute, update fail count
+ if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
+ fc_data->failcount = pcmk__add_scores(fc_data->failcount,
+ char2score(value));
+ pe_rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
+ (const char *) key, (const char *) value, fc_data->rsc->id,
+ pcmk_readable_score(fc_data->failcount));
+ return;
+ }
- /* Resource fail count is sum of all matching operation fail counts */
- g_hash_table_iter_init(&iter, node->details->attrs);
- while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
- if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
- failcount = pcmk__add_scores(failcount, char2score(value));
- crm_trace("Added %s (%s) to %s fail count (now %s)",
- key, value, rsc->id, pcmk_readable_score(failcount));
- } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
- long long last_ll;
-
- if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
- last = (time_t) QB_MAX(last, last_ll);
- }
+ // If this is a matching last failure attribute, update last failure
+ if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
+ 0) == 0) {
+ long long last_ll;
+
+ if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
+ fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure,
+ last_ll);
}
}
+}
- regfree(&failcount_re);
- regfree(&lastfailure_re);
+/*!
+ * \internal
+ * \brief Update fail count and last failure appropriately for a filler resource
+ *
+ * \param[in] data Filler resource
+ * \param[in,out] user_data Fail count data to update
+ */
+static void
+update_failcount_for_filler(gpointer data, gpointer user_data)
+{
+ pcmk_resource_t *filler = data;
+ struct failcount_data *fc_data = user_data;
+ time_t filler_last_failure = 0;
+
+ fc_data->failcount += pe_get_failcount(fc_data->node, filler,
+ &filler_last_failure, fc_data->flags,
+ fc_data->xml_op);
+ fc_data->last_failure = QB_MAX(fc_data->last_failure, filler_last_failure);
+}
- if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
- *last_failure = last;
- }
+/*!
+ * \internal
+ * \brief Get a resource's fail count on a node
+ *
+ * \param[in] node Node to check
+ * \param[in,out] rsc Resource to check
+ * \param[out] last_failure If not NULL, where to set time of most recent
+ * failure of \p rsc on \p node
+ * \param[in] flags Group of enum pcmk__fc_flags
+ * \param[in] xml_op If not NULL, consider only the action in this
+ * history entry when determining whether on-fail
+ * is configured as "blocked", otherwise consider
+ * all actions configured for \p rsc
+ *
+ * \return Fail count for \p rsc on \p node according to \p flags
+ */
+int
+pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc,
+ time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
+{
+ struct failcount_data fc_data = {
+ .node = node,
+ .rsc = rsc,
+ .flags = flags,
+ .xml_op = xml_op,
+ .failcount = 0,
+ .last_failure = (time_t) 0,
+ };
+
+ // Calculate resource failcount as sum of all matching operation failcounts
+ CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
+ &fc_data.lastfailure_re) == pcmk_rc_ok,
+ return 0);
+ g_hash_table_foreach(node->details->attrs, update_failcount_for_attr,
+ &fc_data);
+ regfree(&(fc_data.failcount_re));
+ regfree(&(fc_data.lastfailure_re));
- /* If failure blocks the resource, disregard any failure timeout */
- if ((failcount > 0) && rsc->failure_timeout
+ // If failure blocks the resource, disregard any failure timeout
+ if ((fc_data.failcount > 0) && (rsc->failure_timeout > 0)
&& block_failure(node, rsc, xml_op)) {
- pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
+ pe_warn("Ignoring failure timeout %d for %s "
+ "because it conflicts with on-fail=block",
rsc->failure_timeout, rsc->id);
rsc->failure_timeout = 0;
}
- /* If all failures have expired, ignore fail count */
- if (pcmk_is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0)
- && rsc->failure_timeout) {
+ // If all failures have expired, ignore fail count
+ if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
+ && (fc_data.last_failure > 0) && (rsc->failure_timeout != 0)) {
time_t now = get_effective_time(rsc->cluster);
- if (now > (last + rsc->failure_timeout)) {
- crm_debug("Failcount for %s on %s expired after %ds",
- rsc->id, pe__node_name(node), rsc->failure_timeout);
- failcount = 0;
+ if (now > (fc_data.last_failure + rsc->failure_timeout)) {
+ pe_rsc_debug(rsc, "Failcount for %s on %s expired after %ds",
+ rsc->id, pe__node_name(node), rsc->failure_timeout);
+ fc_data.failcount = 0;
}
}
- /* We never want the fail counts of a bundle container's fillers to
- * count towards the container's fail count.
+ /* Add the fail count of any filler resources, except that we never want the
+ * fail counts of a bundle container's fillers to count towards the
+ * container's fail count.
*
* Most importantly, a Pacemaker Remote connection to a bundle container
* is a filler of the container, but can reside on a different node than the
@@ -340,62 +411,56 @@ pe_get_failcount(const pe_node_t *node, pe_resource_t *rsc,
* container's fail count on that node could lead to attempting to stop the
* container on the wrong node.
*/
-
- if (pcmk_is_set(flags, pe_fc_fillers) && rsc->fillers
+ if (pcmk_is_set(flags, pcmk__fc_fillers) && (rsc->fillers != NULL)
&& !pe_rsc_is_bundled(rsc)) {
- GList *gIter = NULL;
-
- for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
- pe_resource_t *filler = (pe_resource_t *) gIter->data;
- time_t filler_last_failure = 0;
-
- failcount += pe_get_failcount(node, filler, &filler_last_failure,
- flags, xml_op);
-
- if (last_failure && filler_last_failure > *last_failure) {
- *last_failure = filler_last_failure;
- }
- }
-
- if (failcount > 0) {
- crm_info("Container %s and the resources within it "
- "have failed %s time%s on %s",
- rsc->id, pcmk_readable_score(failcount),
- pcmk__plural_s(failcount), pe__node_name(node));
+ g_list_foreach(rsc->fillers, update_failcount_for_filler, &fc_data);
+ if (fc_data.failcount > 0) {
+ pe_rsc_info(rsc,
+ "Container %s and the resources within it "
+ "have failed %s time%s on %s",
+ rsc->id, pcmk_readable_score(fc_data.failcount),
+ pcmk__plural_s(fc_data.failcount), pe__node_name(node));
}
- } else if (failcount > 0) {
- crm_info("%s has failed %s time%s on %s",
- rsc->id, pcmk_readable_score(failcount),
- pcmk__plural_s(failcount), pe__node_name(node));
+ } else if (fc_data.failcount > 0) {
+ pe_rsc_info(rsc, "%s has failed %s time%s on %s",
+ rsc->id, pcmk_readable_score(fc_data.failcount),
+ pcmk__plural_s(fc_data.failcount), pe__node_name(node));
}
- return failcount;
+ if (last_failure != NULL) {
+ if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
+ *last_failure = fc_data.last_failure;
+ } else {
+ *last_failure = 0;
+ }
+ }
+ return fc_data.failcount;
}
/*!
* \brief Schedule a controller operation to clear a fail count
*
- * \param[in,out] rsc Resource with failure
- * \param[in] node Node failure occurred on
- * \param[in] reason Readable description why needed (for logging)
- * \param[in,out] data_set Working set for cluster
+ * \param[in,out] rsc Resource with failure
+ * \param[in] node Node failure occurred on
+ * \param[in] reason Readable description why needed (for logging)
+ * \param[in,out] scheduler Scheduler data for cluster
*
* \return Scheduled action
*/
-pe_action_t *
-pe__clear_failcount(pe_resource_t *rsc, const pe_node_t *node,
- const char *reason, pe_working_set_t *data_set)
+pcmk_action_t *
+pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node,
+ const char *reason, pcmk_scheduler_t *scheduler)
{
char *key = NULL;
- pe_action_t *clear = NULL;
+ pcmk_action_t *clear = NULL;
- CRM_CHECK(rsc && node && reason && data_set, return NULL);
+ CRM_CHECK(rsc && node && reason && scheduler, return NULL);
- key = pcmk__op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
- clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE,
- data_set);
+ key = pcmk__op_key(rsc->id, PCMK_ACTION_CLEAR_FAILCOUNT, 0);
+ clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
+ scheduler);
add_hash_param(clear->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
rsc->id, pe__node_name(node), reason, clear->uuid);