diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
commit | e5a812082ae033afb1eed82c0f2df3d0f6bdc93f (patch) | |
tree | a6716c9275b4b413f6c9194798b34b91affb3cc7 /lib/pengine/failcounts.c | |
parent | Initial commit. (diff) | |
download | pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.tar.xz pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.zip |
Adding upstream version 2.1.6.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/pengine/failcounts.c')
-rw-r--r-- | lib/pengine/failcounts.c | 403 |
1 file changed, 403 insertions(+), 0 deletions(-)
diff --git a/lib/pengine/failcounts.c b/lib/pengine/failcounts.c new file mode 100644 index 0000000..a4a3e11 --- /dev/null +++ b/lib/pengine/failcounts.c @@ -0,0 +1,403 @@ +/* + * Copyright 2008-2023 the Pacemaker project contributors + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <sys/types.h> +#include <regex.h> +#include <glib.h> + +#include <crm/crm.h> +#include <crm/msg_xml.h> +#include <crm/common/xml.h> +#include <crm/common/util.h> +#include <crm/pengine/internal.h> + +static gboolean +is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml, + const xmlNode *lrm_op_xml) +{ + gboolean matched = FALSE; + const char *conf_op_name = NULL; + const char *lrm_op_task = NULL; + const char *conf_op_interval_spec = NULL; + guint conf_op_interval_ms = 0; + guint lrm_op_interval_ms = 0; + const char *lrm_op_id = NULL; + char *last_failure_key = NULL; + + if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) { + return FALSE; + } + + // Get name and interval from configured op + conf_op_name = crm_element_value(conf_op_xml, "name"); + conf_op_interval_spec = crm_element_value(conf_op_xml, + XML_LRM_ATTR_INTERVAL); + conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec); + + // Get name and interval from op history entry + lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK); + crm_element_value_ms(lrm_op_xml, XML_LRM_ATTR_INTERVAL_MS, + &lrm_op_interval_ms); + + if ((conf_op_interval_ms != lrm_op_interval_ms) + || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) { + return FALSE; + } + + lrm_op_id = ID(lrm_op_xml); + last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0); + + if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) { + matched = TRUE; + + } else { + char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name, + conf_op_interval_ms); + + if 
(pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) { + int rc = 0; + int target_rc = pe__target_rc_from_xml(lrm_op_xml); + + crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc); + if (rc != target_rc) { + matched = TRUE; + } + } + free(expected_op_key); + } + + free(last_failure_key); + return matched; +} + +static gboolean +block_failure(const pe_node_t *node, pe_resource_t *rsc, const xmlNode *xml_op) +{ + char *xml_name = clone_strip(rsc->id); + + /* @TODO This xpath search occurs after template expansion, but it is unable + * to properly detect on-fail in id-ref, operation meta-attributes, or + * op_defaults, or evaluate rules. + * + * Also, on-fail defaults to block (in unpack_operation()) for stop actions + * when stonith is disabled. + * + * Ideally, we'd unpack the operation before this point, and pass in a + * meta-attributes table that takes all that into consideration. + */ + char *xpath = crm_strdup_printf("//" XML_CIB_TAG_RESOURCE + "[@" XML_ATTR_ID "='%s']" + "//" XML_ATTR_OP + "[@" XML_OP_ATTR_ON_FAIL "='block']", + xml_name); + + xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath); + gboolean should_block = FALSE; + + free(xpath); + + if (xpathObj) { + int max = numXpathResults(xpathObj); + int lpc = 0; + + for (lpc = 0; lpc < max; lpc++) { + xmlNode *pref = getXpathResult(xpathObj, lpc); + + if (xml_op) { + should_block = is_matched_failure(xml_name, pref, xml_op); + if (should_block) { + break; + } + + } else { + const char *conf_op_name = NULL; + const char *conf_op_interval_spec = NULL; + guint conf_op_interval_ms = 0; + char *lrm_op_xpath = NULL; + xmlXPathObject *lrm_op_xpathObj = NULL; + + // Get name and interval from configured op + conf_op_name = crm_element_value(pref, "name"); + conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL); + conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec); + +#define XPATH_FMT "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \ + "//" 
XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']" \ + "/" XML_LRM_TAG_RSC_OP "[@" XML_LRM_ATTR_TASK "='%s']" \ + "[@" XML_LRM_ATTR_INTERVAL "='%u']" + + lrm_op_xpath = crm_strdup_printf(XPATH_FMT, + node->details->uname, xml_name, + conf_op_name, + conf_op_interval_ms); + lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath); + + free(lrm_op_xpath); + + if (lrm_op_xpathObj) { + int max2 = numXpathResults(lrm_op_xpathObj); + int lpc2 = 0; + + for (lpc2 = 0; lpc2 < max2; lpc2++) { + xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj, + lpc2); + + should_block = is_matched_failure(xml_name, pref, + lrm_op_xml); + if (should_block) { + break; + } + } + } + freeXpathObject(lrm_op_xpathObj); + + if (should_block) { + break; + } + } + } + } + + free(xml_name); + freeXpathObject(xpathObj); + + return should_block; +} + +/*! + * \internal + * \brief Get resource name as used in failure-related node attributes + * + * \param[in] rsc Resource to check + * + * \return Newly allocated string containing resource's fail name + * \note The caller is responsible for freeing the result. + */ +static inline char * +rsc_fail_name(const pe_resource_t *rsc) +{ + const char *name = (rsc->clone_name? rsc->clone_name : rsc->id); + + return pcmk_is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name); +} + +/*! + * \internal + * \brief Compile regular expression to match a failure-related node attribute + * + * \param[in] prefix Attribute prefix to match + * \param[in] rsc_name Resource name to match as used in failure attributes + * \param[in] is_legacy Whether DC uses per-resource fail counts + * \param[in] is_unique Whether the resource is a globally unique clone + * \param[out] re Where to store resulting regular expression + * + * \return Standard Pacemaker return code + * \note Fail attributes are named like PREFIX-RESOURCE#OP_INTERVAL. + * The caller is responsible for freeing re with regfree(). 
+ */ +static int +generate_fail_regex(const char *prefix, const char *rsc_name, + gboolean is_legacy, gboolean is_unique, regex_t *re) +{ + char *pattern; + + /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than + * per-operation. + */ + const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+"); + + /* Ignore instance numbers for anything other than globally unique clones. + * Anonymous clone fail counts could contain an instance number if the + * clone was initially unique, failed, then was converted to anonymous. + * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained + * clone instance numbers. + */ + const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?"); + + pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name, + instance_pattern, op_pattern); + if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) { + free(pattern); + return EINVAL; + } + + free(pattern); + return pcmk_rc_ok; +} + +/*! + * \internal + * \brief Compile regular expressions to match failure-related node attributes + * + * \param[in] rsc Resource being checked for failures + * \param[in] data_set Data set (for CRM feature set version) + * \param[out] failcount_re Storage for regular expression for fail count + * \param[out] lastfailure_re Storage for regular expression for last failure + * + * \return Standard Pacemaker return code + * \note On success, the caller is responsible for freeing the expressions with + * regfree(). 
+ */ +static int +generate_fail_regexes(const pe_resource_t *rsc, + const pe_working_set_t *data_set, + regex_t *failcount_re, regex_t *lastfailure_re) +{ + char *rsc_name = rsc_fail_name(rsc); + const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION); + gboolean is_legacy = (compare_version(version, "3.0.13") < 0); + int rc = pcmk_rc_ok; + + if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy, + pcmk_is_set(rsc->flags, pe_rsc_unique), + failcount_re) != pcmk_rc_ok) { + rc = EINVAL; + + } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name, + is_legacy, + pcmk_is_set(rsc->flags, pe_rsc_unique), + lastfailure_re) != pcmk_rc_ok) { + rc = EINVAL; + regfree(failcount_re); + } + + free(rsc_name); + return rc; +} + +int +pe_get_failcount(const pe_node_t *node, pe_resource_t *rsc, + time_t *last_failure, uint32_t flags, const xmlNode *xml_op) +{ + char *key = NULL; + const char *value = NULL; + regex_t failcount_re, lastfailure_re; + int failcount = 0; + time_t last = 0; + GHashTableIter iter; + + CRM_CHECK(generate_fail_regexes(rsc, rsc->cluster, &failcount_re, + &lastfailure_re) == pcmk_rc_ok, + return 0); + + /* Resource fail count is sum of all matching operation fail counts */ + g_hash_table_iter_init(&iter, node->details->attrs); + while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) { + if (regexec(&failcount_re, key, 0, NULL, 0) == 0) { + failcount = pcmk__add_scores(failcount, char2score(value)); + crm_trace("Added %s (%s) to %s fail count (now %s)", + key, value, rsc->id, pcmk_readable_score(failcount)); + } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) { + long long last_ll; + + if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) { + last = (time_t) QB_MAX(last, last_ll); + } + } + } + + regfree(&failcount_re); + regfree(&lastfailure_re); + + if ((failcount > 0) && (last > 0) && (last_failure != NULL)) { + *last_failure = last; + } + + /* If failure blocks the resource, 
disregard any failure timeout */ + if ((failcount > 0) && rsc->failure_timeout + && block_failure(node, rsc, xml_op)) { + + pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block", + rsc->failure_timeout, rsc->id); + rsc->failure_timeout = 0; + } + + /* If all failures have expired, ignore fail count */ + if (pcmk_is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0) + && rsc->failure_timeout) { + + time_t now = get_effective_time(rsc->cluster); + + if (now > (last + rsc->failure_timeout)) { + crm_debug("Failcount for %s on %s expired after %ds", + rsc->id, pe__node_name(node), rsc->failure_timeout); + failcount = 0; + } + } + + /* We never want the fail counts of a bundle container's fillers to + * count towards the container's fail count. + * + * Most importantly, a Pacemaker Remote connection to a bundle container + * is a filler of the container, but can reside on a different node than the + * container itself. Counting its fail count on its node towards the + * container's fail count on that node could lead to attempting to stop the + * container on the wrong node. 
+ */ + + if (pcmk_is_set(flags, pe_fc_fillers) && rsc->fillers + && !pe_rsc_is_bundled(rsc)) { + + GList *gIter = NULL; + + for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) { + pe_resource_t *filler = (pe_resource_t *) gIter->data; + time_t filler_last_failure = 0; + + failcount += pe_get_failcount(node, filler, &filler_last_failure, + flags, xml_op); + + if (last_failure && filler_last_failure > *last_failure) { + *last_failure = filler_last_failure; + } + } + + if (failcount > 0) { + crm_info("Container %s and the resources within it " + "have failed %s time%s on %s", + rsc->id, pcmk_readable_score(failcount), + pcmk__plural_s(failcount), pe__node_name(node)); + } + + } else if (failcount > 0) { + crm_info("%s has failed %s time%s on %s", + rsc->id, pcmk_readable_score(failcount), + pcmk__plural_s(failcount), pe__node_name(node)); + } + + return failcount; +} + +/*! + * \brief Schedule a controller operation to clear a fail count + * + * \param[in,out] rsc Resource with failure + * \param[in] node Node failure occurred on + * \param[in] reason Readable description why needed (for logging) + * \param[in,out] data_set Working set for cluster + * + * \return Scheduled action + */ +pe_action_t * +pe__clear_failcount(pe_resource_t *rsc, const pe_node_t *node, + const char *reason, pe_working_set_t *data_set) +{ + char *key = NULL; + pe_action_t *clear = NULL; + + CRM_CHECK(rsc && node && reason && data_set, return NULL); + + key = pcmk__op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0); + clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, + data_set); + add_hash_param(clear->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); + crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s", + rsc->id, pe__node_name(node), reason, clear->uuid); + return clear; +} |