diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 07:45:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 07:45:40 +0000 |
commit | 07d7f4cfa4b10de87a31b68191036ff446add675 (patch) | |
tree | 7162524d8aaf1aef62d2f4fa51f595ed113981ff /python | |
parent | Adding upstream version 2.1.6. (diff) | |
download | pacemaker-07d7f4cfa4b10de87a31b68191036ff446add675.tar.xz pacemaker-07d7f4cfa4b10de87a31b68191036ff446add675.zip |
Adding upstream version 2.1.7.upstream/2.1.7
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'python')
55 files changed, 7304 insertions, 87 deletions
diff --git a/python/Makefile.am b/python/Makefile.am index 6cefb63..803fb0c 100644 --- a/python/Makefile.am +++ b/python/Makefile.am @@ -11,10 +11,16 @@ MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = pylintrc -SUBDIRS = pacemaker tests +SUBDIRS = pacemaker \ + tests +.PHONY: check-local check-local: - $(PYTHON) -m unittest discover -v -s tests + if [ "x$(top_srcdir)" != "x$(top_builddir)" ]; then \ + cp -r $(top_srcdir)/python/* $(abs_top_builddir)/python/; \ + fi + PYTHONPATH=$(top_builddir)/python $(PYTHON) -m unittest discover -v -s $(top_builddir)/python/tests +.PHONY: pylint pylint: pylint $(SUBDIRS) diff --git a/python/pacemaker/Makefile.am b/python/pacemaker/Makefile.am index f209bba..df9cc46 100644 --- a/python/pacemaker/Makefile.am +++ b/python/pacemaker/Makefile.am @@ -9,8 +9,8 @@ MAINTAINERCLEANFILES = Makefile.in -pkgpython_PYTHON = __init__.py \ - exitstatus.py +pkgpython_PYTHON = __init__.py \ + exitstatus.py nodist_pkgpython_PYTHON = buildoptions.py diff --git a/python/pacemaker/_cts/CTS.py b/python/pacemaker/_cts/CTS.py index 4ca7e59..166ea10 100644 --- a/python/pacemaker/_cts/CTS.py +++ b/python/pacemaker/_cts/CTS.py @@ -10,6 +10,7 @@ import traceback from pacemaker.exitstatus import ExitStatus from pacemaker._cts.environment import EnvFactory +from pacemaker._cts.input import should_continue from pacemaker._cts.logging import LogFactory from pacemaker._cts.remote import RemoteFactory @@ -32,7 +33,7 @@ class CtsLab: def __init__(self, args=None): """ Create a new CtsLab instance. This class can be treated kind of like a dictionary due to the presence of typical dict functions - like has_key, __getitem__, and __setitem__. However, it is not a + like __contains__, __getitem__, and __setitem__. However, it is not a dictionary so do not rely on standard dictionary behavior. Arguments: @@ -48,10 +49,12 @@ class CtsLab: self._env.dump() - def has_key(self, key): + def __contains__(self, key): """ Does the given environment key exist? """ - return key in list(self._env.keys()) + # pylint gets confused because of EnvFactory here. + # pylint: disable=unsupported-membership-test + return key in self._env def __getitem__(self, key): """ Return the given environment key, or raise KeyError if it does @@ -90,7 +93,7 @@ class CtsLab: for node in self._env["nodes"]: self._logger.log(" * %s" % (node)) - if not scenario.SetUp(): + if not scenario.setup(): return ExitStatus.ERROR # We want to alert on any exceptions caused by running a scenario, so @@ -103,16 +106,16 @@ class CtsLab: self._logger.traceback(traceback) scenario.summarize() - scenario.TearDown() + scenario.teardown() return ExitStatus.ERROR - scenario.TearDown() + scenario.teardown() scenario.summarize() - if scenario.Stats["failure"] > 0: + if scenario.stats["failure"] > 0: return ExitStatus.ERROR - if scenario.Stats["success"] != iterations: + if scenario.stats["success"] != iterations: self._logger.log("No failure count but success != requested iterations") return ExitStatus.ERROR @@ -177,15 +180,7 @@ class NodeStatus: timeout -= 1 LogFactory().log("%s did not come up within %d tries" % (node, initial_timeout)) - if self._env["continue"]: - answer = "Y" - else: - try: - answer = input('Continue? [nY]') - except EOFError: - answer = "n" - - if answer and answer == "n": + if not should_continue(self._env["continue"]): raise ValueError("%s did not come up within %d tries" % (node, initial_timeout)) return False @@ -241,4 +236,4 @@ class Process: (rc, _) = self._cm.rsh(node, "killall -9 %s" % self.name) if rc != 0: - self._cm.log ("ERROR: Kill %s failed on node %s" % (self.name, node)) + self._cm.log("ERROR: Kill %s failed on node %s" % (self.name, node)) diff --git a/python/pacemaker/_cts/Makefile.am b/python/pacemaker/_cts/Makefile.am index 3b3e3f8..efb0019 100644 --- a/python/pacemaker/_cts/Makefile.am +++ b/python/pacemaker/_cts/Makefile.am @@ -11,14 +11,6 @@ MAINTAINERCLEANFILES = Makefile.in pkgpythondir = $(pythondir)/$(PACKAGE)/_cts -pkgpython_PYTHON = CTS.py \ - __init__.py \ - corosync.py \ - environment.py \ - errors.py \ - logging.py \ - patterns.py \ - process.py \ - remote.py \ - test.py \ - watcher.py +pkgpython_PYTHON = $(wildcard *.py) + +SUBDIRS = tests diff --git a/python/pacemaker/_cts/audits.py b/python/pacemaker/_cts/audits.py new file mode 100644 index 0000000..dc66f96 --- /dev/null +++ b/python/pacemaker/_cts/audits.py @@ -0,0 +1,1029 @@ +""" Auditing classes for Pacemaker's Cluster Test Suite (CTS) """ + +__all__ = ["AuditConstraint", "AuditResource", "ClusterAudit", "audit_list"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re +import time +import uuid + +from pacemaker.buildoptions import BuildOptions +from pacemaker._cts.input import should_continue +from pacemaker._cts.watcher import LogKind, LogWatcher + + +class ClusterAudit: + """ The base class for various kinds of auditors. Specific audit implementations + should be built on top of this one. Audits can do all kinds of checks on the + system. The basic interface for callers is the `__call__` method, which + returns True if the audit passes and False if it fails. + """ + + def __init__(self, cm): + """ Create a new ClusterAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + # pylint: disable=invalid-name + self._cm = cm + self.name = None + + def __call__(self): + raise NotImplementedError + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. + This method must be implemented by all subclasses. + """ + + raise NotImplementedError + + def log(self, args): + """ Log a message """ + + self._cm.log("audit: %s" % args) + + def debug(self, args): + """ Log a debug message """ + + self._cm.debug("audit: %s" % args) + + +class LogAudit(ClusterAudit): + """ Audit each cluster node to verify that some logging system is usable. + This is done by logging a unique test message and then verifying that + we can read back that test message using logging tools. + """ + + def __init__(self, cm): + """ Create a new LogAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + ClusterAudit.__init__(self, cm) + self.name = "LogAudit" + + def _restart_cluster_logging(self, nodes=None): + """ Restart logging on the given nodes, or all if none are given """ + + if not nodes: + nodes = self._cm.env["nodes"] + + self._cm.debug("Restarting logging on: %r" % nodes) + + for node in nodes: + if self._cm.env["have_systemd"]: + (rc, _) = self._cm.rsh(node, "systemctl stop systemd-journald.socket") + if rc != 0: + self._cm.log("ERROR: Cannot stop 'systemd-journald' on %s" % node) + + (rc, _) = self._cm.rsh(node, "systemctl start systemd-journald.service") + if rc != 0: + self._cm.log("ERROR: Cannot start 'systemd-journald' on %s" % node) + + (rc, _) = self._cm.rsh(node, "service %s restart" % self._cm.env["syslogd"]) + if rc != 0: + self._cm.log("ERROR: Cannot restart '%s' on %s" % (self._cm.env["syslogd"], node)) + + def _create_watcher(self, patterns, kind): + """ Create a new LogWatcher instance for the given patterns """ + + watch = LogWatcher(self._cm.env["LogFileName"], patterns, + self._cm.env["nodes"], kind, "LogAudit", 5, + silent=True) + watch.set_watch() + return watch + + def _test_logging(self): + """ Perform the log audit """ + + patterns = [] + prefix = "Test message from" + suffix = str(uuid.uuid4()) + watch = {} + + for node in self._cm.env["nodes"]: + # Look for the node name in two places to make sure + # that syslog is logging with the correct hostname + m = re.search("^([^.]+).*", node) + if m: + simple = m.group(1) + else: + simple = node + + patterns.append("%s.*%s %s %s" % (simple, prefix, node, suffix)) + + watch_pref = self._cm.env["LogWatcher"] + if watch_pref == LogKind.ANY: + kinds = [LogKind.FILE] + if self._cm.env["have_systemd"]: + kinds += [LogKind.JOURNAL] + kinds += [LogKind.REMOTE_FILE] + for k in kinds: + watch[k] = self._create_watcher(patterns, k) + self._cm.log("Logging test message with identifier %s" % suffix) + else: + watch[watch_pref] = self._create_watcher(patterns, watch_pref) + + for node in self._cm.env["nodes"]: + cmd = "logger -p %s.info %s %s %s" % (self._cm.env["SyslogFacility"], prefix, node, suffix) + + (rc, _) = self._cm.rsh(node, cmd, synchronous=False, verbose=0) + if rc != 0: + self._cm.log("ERROR: Cannot execute remote command [%s] on %s" % (cmd, node)) + + for k in list(watch.keys()): + w = watch[k] + if watch_pref == LogKind.ANY: + self._cm.log("Checking for test message in %s logs" % k) + w.look_for_all(silent=True) + if w.unmatched: + for regex in w.unmatched: + self._cm.log("Test message [%s] not found in %s logs" % (regex, w.kind)) + else: + if watch_pref == LogKind.ANY: + self._cm.log("Found test message in %s logs" % k) + self._cm.env["LogWatcher"] = k + return 1 + + return False + + def __call__(self): + max_attempts = 3 + attempt = 0 + + self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) + while attempt <= max_attempts and not self._test_logging(): + attempt += 1 + self._restart_cluster_logging() + time.sleep(60*attempt) + + if attempt > max_attempts: + self._cm.log("ERROR: Cluster logging unrecoverable.") + return False + + return True + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. """ + + if self._cm.env["LogAuditDisabled"]: + return False + + return True + + +class DiskAudit(ClusterAudit): + """ Audit disk usage on cluster nodes to verify that there is enough free + space left on whichever mounted file system holds the logs. + + Warn on: less than 100 MB or 10% of free space + Error on: less than 10 MB or 5% of free space + """ + + def __init__(self, cm): + """ Create a new DiskAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + ClusterAudit.__init__(self, cm) + self.name = "DiskspaceAudit" + + def __call__(self): + result = True + + # @TODO Use directory of PCMK_logfile if set on host + dfcmd = "df -BM %s | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%%'" % BuildOptions.LOG_DIR + + self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) + for node in self._cm.env["nodes"]: + (_, dfout) = self._cm.rsh(node, dfcmd, verbose=1) + if not dfout: + self._cm.log("ERROR: Cannot execute remote df command [%s] on %s" % (dfcmd, node)) + continue + + dfout = dfout[0].strip() + + try: + (used, remain) = dfout.split() + used_percent = int(used) + remaining_mb = int(remain) + except (ValueError, TypeError): + self._cm.log("Warning: df output '%s' from %s was invalid [%s, %s]" + % (dfout, node, used, remain)) + else: + if remaining_mb < 10 or used_percent > 95: + self._cm.log("CRIT: Out of log disk space on %s (%d%% / %dMB)" + % (node, used_percent, remaining_mb)) + result = False + + if not should_continue(self._cm.env): + raise ValueError("Disk full on %s" % node) + + elif remaining_mb < 100 or used_percent > 90: + self._cm.log("WARN: Low on log disk space (%dMB) on %s" % (remaining_mb, node)) + + return result + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. """ + + return True + + +class FileAudit(ClusterAudit): + """ Audit the filesystem looking for various failure conditions: + + * The presence of core dumps from corosync or Pacemaker daemons + * Stale IPC files + """ + + def __init__(self, cm): + """ Create a new FileAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + ClusterAudit.__init__(self, cm) + self.known = [] + self.name = "FileAudit" + + def __call__(self): + result = True + + self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) + for node in self._cm.env["nodes"]: + + (_, lsout) = self._cm.rsh(node, "ls -al /var/lib/pacemaker/cores/* | grep core.[0-9]", verbose=1) + for line in lsout: + line = line.strip() + + if line not in self.known: + result = False + self.known.append(line) + self._cm.log("Warning: Pacemaker core file on %s: %s" % (node, line)) + + (_, lsout) = self._cm.rsh(node, "ls -al /var/lib/corosync | grep core.[0-9]", verbose=1) + for line in lsout: + line = line.strip() + + if line not in self.known: + result = False + self.known.append(line) + self._cm.log("Warning: Corosync core file on %s: %s" % (node, line)) + + if self._cm.expected_status.get(node) == "down": + clean = False + (_, lsout) = self._cm.rsh(node, "ls -al /dev/shm | grep qb-", verbose=1) + + for line in lsout: + result = False + clean = True + self._cm.log("Warning: Stale IPC file on %s: %s" % (node, line)) + + if clean: + (_, lsout) = self._cm.rsh(node, "ps axf | grep -e pacemaker -e corosync", verbose=1) + + for line in lsout: + self._cm.debug("ps[%s]: %s" % (node, line)) + + self._cm.rsh(node, "rm -rf /dev/shm/qb-*") + + else: + self._cm.debug("Skipping %s" % node) + + return result + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. """ + + return True + + +class AuditResource: + """ A base class for storing information about a cluster resource """ + + def __init__(self, cm, line): + """ Create a new AuditResource instance + + Arguments: + + cm -- A ClusterManager instance + line -- One line of output from `crm_resource` describing a single + resource + """ + + # pylint: disable=invalid-name + fields = line.split() + self._cm = cm + self.line = line + self.type = fields[1] + self.id = fields[2] + self.clone_id = fields[3] + self.parent = fields[4] + self.rprovider = fields[5] + self.rclass = fields[6] + self.rtype = fields[7] + self.host = fields[8] + self.needs_quorum = fields[9] + self.flags = int(fields[10]) + self.flags_s = fields[11] + + if self.parent == "NA": + self.parent = None + + @property + def unique(self): + """ Is this resource unique? """ + + return self.flags & 0x20 + + @property + def orphan(self): + """ Is this resource an orphan? """ + + return self.flags & 0x01 + + @property + def managed(self): + """ Is this resource managed by the cluster? """ + + return self.flags & 0x02 + + +class AuditConstraint: + """ A base class for storing information about a cluster constraint """ + + def __init__(self, cm, line): + """ Create a new AuditConstraint instance + + Arguments: + + cm -- A ClusterManager instance + line -- One line of output from `crm_resource` describing a single + constraint + """ + + # pylint: disable=invalid-name + fields = line.split() + self._cm = cm + self.line = line + self.type = fields[1] + self.id = fields[2] + self.rsc = fields[3] + self.target = fields[4] + self.score = fields[5] + self.rsc_role = fields[6] + self.target_role = fields[7] + + if self.rsc_role == "NA": + self.rsc_role = None + + if self.target_role == "NA": + self.target_role = None + + +class PrimitiveAudit(ClusterAudit): + """ Audit primitive resources to verify a variety of conditions, including that + they are active and managed only when expected; they are active on the + expected clusted node; and that they are not orphaned. + """ + + def __init__(self, cm): + """ Create a new PrimitiveAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + ClusterAudit.__init__(self, cm) + self.name = "PrimitiveAudit" + + self._active_nodes = [] + self._constraints = [] + self._inactive_nodes = [] + self._resources = [] + self._target = None + + def _audit_resource(self, resource, quorum): + """ Perform the audit of a single resource """ + + rc = True + active = self._cm.resource_location(resource.id) + + if len(active) == 1: + if quorum: + self.debug("Resource %s active on %r" % (resource.id, active)) + + elif resource.needs_quorum == 1: + self._cm.log("Resource %s active without quorum: %r" % (resource.id, active)) + rc = False + + elif not resource.managed: + self._cm.log("Resource %s not managed. Active on %r" % (resource.id, active)) + + elif not resource.unique: + # TODO: Figure out a clever way to actually audit these resource types + if len(active) > 1: + self.debug("Non-unique resource %s is active on: %r" % (resource.id, active)) + else: + self.debug("Non-unique resource %s is not active" % resource.id) + + elif len(active) > 1: + self._cm.log("Resource %s is active multiple times: %r" % (resource.id, active)) + rc = False + + elif resource.orphan: + self.debug("Resource %s is an inactive orphan" % resource.id) + + elif not self._inactive_nodes: + self._cm.log("WARN: Resource %s not served anywhere" % resource.id) + rc = False + + elif self._cm.env["warn-inactive"]: + if quorum or not resource.needs_quorum: + self._cm.log("WARN: Resource %s not served anywhere (Inactive nodes: %r)" + % (resource.id, self._inactive_nodes)) + else: + self.debug("Resource %s not served anywhere (Inactive nodes: %r)" + % (resource.id, self._inactive_nodes)) + + elif quorum or not resource.needs_quorum: + self.debug("Resource %s not served anywhere (Inactive nodes: %r)" + % (resource.id, self._inactive_nodes)) + + return rc + + def _setup(self): + """ Verify cluster nodes are active, and collect resource and colocation + information used for performing the audit. + """ + + for node in self._cm.env["nodes"]: + if self._cm.expected_status[node] == "up": + self._active_nodes.append(node) + else: + self._inactive_nodes.append(node) + + for node in self._cm.env["nodes"]: + if self._target is None and self._cm.expected_status[node] == "up": + self._target = node + + if not self._target: + # TODO: In Pacemaker 1.0 clusters we'll be able to run crm_resource + # with CIB_file=/path/to/cib.xml even when the cluster isn't running + self.debug("No nodes active - skipping %s" % self.name) + return False + + (_, lines) = self._cm.rsh(self._target, "crm_resource -c", verbose=1) + + for line in lines: + if re.search("^Resource", line): + self._resources.append(AuditResource(self._cm, line)) + elif re.search("^Constraint", line): + self._constraints.append(AuditConstraint(self._cm, line)) + else: + self._cm.log("Unknown entry: %s" % line) + + return True + + def __call__(self): + result = True + + if not self._setup(): + return result + + quorum = self._cm.has_quorum(None) + for resource in self._resources: + if resource.type == "primitive" and not self._audit_resource(resource, quorum): + result = False + + return result + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. """ + + # @TODO Due to long-ago refactoring, this name test would never match, + # so this audit (and those derived from it) would never run. + # Uncommenting the next lines fixes the name test, but that then + # exposes pre-existing bugs that need to be fixed. + #if self._cm["Name"] == "crm-corosync": + # return True + return False + + +class GroupAudit(PrimitiveAudit): + """ Audit group resources to verify that each of its child primitive + resources is active on the expected cluster node. + """ + + def __init__(self, cm): + """ Create a new GroupAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + PrimitiveAudit.__init__(self, cm) + self.name = "GroupAudit" + + def __call__(self): + result = True + + if not self._setup(): + return result + + for group in self._resources: + if group.type != "group": + continue + + first_match = True + group_location = None + + for child in self._resources: + if child.parent != group.id: + continue + + nodes = self._cm.resource_location(child.id) + + if first_match and len(nodes) > 0: + group_location = nodes[0] + + first_match = False + + if len(nodes) > 1: + result = False + self._cm.log("Child %s of %s is active more than once: %r" + % (child.id, group.id, nodes)) + + elif not nodes: + # Groups are allowed to be partially active + # However we do need to make sure later children aren't running + group_location = None + self.debug("Child %s of %s is stopped" % (child.id, group.id)) + + elif nodes[0] != group_location: + result = False + self._cm.log("Child %s of %s is active on the wrong node (%s) expected %s" + % (child.id, group.id, nodes[0], group_location)) + else: + self.debug("Child %s of %s is active on %s" % (child.id, group.id, nodes[0])) + + return result + + +class CloneAudit(PrimitiveAudit): + """ Audit clone resources. NOTE: Currently, this class does not perform + any actual audit functions. + """ + + def __init__(self, cm): + """ Create a new CloneAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + PrimitiveAudit.__init__(self, cm) + self.name = "CloneAudit" + + def __call__(self): + result = True + + if not self._setup(): + return result + + for clone in self._resources: + if clone.type != "clone": + continue + + for child in self._resources: + if child.parent == clone.id and child.type == "primitive": + self.debug("Checking child %s of %s..." % (child.id, clone.id)) + # Check max and node_max + # Obtain with: + # crm_resource -g clone_max --meta -r child.id + # crm_resource -g clone_node_max --meta -r child.id + + return result + + +class ColocationAudit(PrimitiveAudit): + """ Audit cluster resources to verify that those that should be colocated + with each other actually are. + """ + + def __init__(self, cm): + """ Create a new ColocationAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + PrimitiveAudit.__init__(self, cm) + self.name = "ColocationAudit" + + def _crm_location(self, resource): + """ Return a list of cluster nodes where a given resource is running """ + + (rc, lines) = self._cm.rsh(self._target, "crm_resource -W -r %s -Q" % resource, verbose=1) + hosts = [] + + if rc == 0: + for line in lines: + fields = line.split() + hosts.append(fields[0]) + + return hosts + + def __call__(self): + result = True + + if not self._setup(): + return result + + for coloc in self._constraints: + if coloc.type != "rsc_colocation": + continue + + source = self._crm_location(coloc.rsc) + target = self._crm_location(coloc.target) + + if not source: + self.debug("Colocation audit (%s): %s not running" % (coloc.id, coloc.rsc)) + else: + for node in source: + if not node in target: + result = False + self._cm.log("Colocation audit (%s): %s running on %s (not in %r)" + % (coloc.id, coloc.rsc, node, target)) + else: + self.debug("Colocation audit (%s): %s running on %s (in %r)" + % (coloc.id, coloc.rsc, node, target)) + + return result + + +class ControllerStateAudit(ClusterAudit): + """ Audit cluster nodes to verify that those we expect to be active are + active, and those that are expected to be inactive are inactive. + """ + + def __init__(self, cm): + """ Create a new ControllerStateAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + ClusterAudit.__init__(self, cm) + self.name = "ControllerStateAudit" + + def __call__(self): + result = True + up_are_down = 0 + down_are_up = 0 + unstable_list = [] + + for node in self._cm.env["nodes"]: + should_be = self._cm.expected_status[node] + rc = self._cm.test_node_cm(node) + + if rc > 0: + if should_be == "down": + down_are_up += 1 + + if rc == 1: + unstable_list.append(node) + + elif should_be == "up": + up_are_down += 1 + + if len(unstable_list) > 0: + result = False + self._cm.log("Cluster is not stable: %d (of %d): %r" + % (len(unstable_list), self._cm.upcount(), unstable_list)) + + if up_are_down > 0: + result = False + self._cm.log("%d (of %d) nodes expected to be up were down." + % (up_are_down, len(self._cm.env["nodes"]))) + + if down_are_up > 0: + result = False + self._cm.log("%d (of %d) nodes expected to be down were up." + % (down_are_up, len(self._cm.env["nodes"]))) + + return result + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. """ + + # @TODO Due to long-ago refactoring, this name test would never match, + # so this audit (and those derived from it) would never run. + # Uncommenting the next lines fixes the name test, but that then + # exposes pre-existing bugs that need to be fixed. + #if self._cm["Name"] == "crm-corosync": + # return True + return False + + +class CIBAudit(ClusterAudit): + """ Audit the CIB by verifying that it is identical across cluster nodes """ + + def __init__(self, cm): + """ Create a new CIBAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + ClusterAudit.__init__(self, cm) + self.name = "CibAudit" + + def __call__(self): + result = True + ccm_partitions = self._cm.find_partitions() + + if not ccm_partitions: + self.debug("\tNo partitions to audit") + return result + + for partition in ccm_partitions: + self.debug("\tAuditing CIB consistency for: %s" % partition) + + if self._audit_cib_contents(partition) == 0: + result = False + + return result + + def _audit_cib_contents(self, hostlist): + """ Perform the CIB audit on the given hosts """ + + passed = True + node0 = None + node0_xml = None + + partition_hosts = hostlist.split() + for node in partition_hosts: + node_xml = self._store_remote_cib(node, node0) + + if node_xml is None: + self._cm.log("Could not perform audit: No configuration from %s" % node) + passed = False + + elif node0 is None: + node0 = node + node0_xml = node_xml + + elif node0_xml is None: + self._cm.log("Could not perform audit: No configuration from %s" % node0) + passed = False + + else: + (rc, result) = self._cm.rsh( + node0, "crm_diff -VV -cf --new %s --original %s" % (node_xml, node0_xml), verbose=1) + + if rc != 0: + self._cm.log("Diff between %s and %s failed: %d" % (node0_xml, node_xml, rc)) + passed = False + + for line in result: + if not re.search("<diff/>", line): + passed = False + self.debug("CibDiff[%s-%s]: %s" % (node0, node, line)) + else: + self.debug("CibDiff[%s-%s] Ignoring: %s" % (node0, node, line)) + + return passed + + def _store_remote_cib(self, node, target): + """ Store a copy of the given node's CIB on the given target node. If + no target is given, store the CIB on the given node. + """ + + filename = "/tmp/ctsaudit.%s.xml" % node + + if not target: + target = node + + (rc, lines) = self._cm.rsh(node, self._cm["CibQuery"], verbose=1) + if rc != 0: + self._cm.log("Could not retrieve configuration") + return None + + self._cm.rsh("localhost", "rm -f %s" % filename) + for line in lines: + self._cm.rsh("localhost", "echo \'%s\' >> %s" % (line[:-1], filename), verbose=0) + + if self._cm.rsh.copy(filename, "root@%s:%s" % (target, filename), silent=True) != 0: + self._cm.log("Could not store configuration") + return None + + return filename + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. """ + + # @TODO Due to long-ago refactoring, this name test would never match, + # so this audit (and those derived from it) would never run. + # Uncommenting the next lines fixes the name test, but that then + # exposes pre-existing bugs that need to be fixed. + #if self._cm["Name"] == "crm-corosync": + # return True + return False + + +class PartitionAudit(ClusterAudit): + """ Audit each partition in a cluster to verify a variety of conditions: + + * The number of partitions and the nodes in each is as expected + * Each node is active when it should be active and inactive when it + should be inactive + * The status and epoch of each node is as expected + * A partition has quorum + * A partition has a DC when expected + """ + + def __init__(self, cm): + """ Create a new PartitionAudit instance + + Arguments: + + cm -- A ClusterManager instance + """ + + ClusterAudit.__init__(self, cm) + self.name = "PartitionAudit" + + self._node_epoch = {} + self._node_state = {} + self._node_quorum = {} + + def __call__(self): + result = True + ccm_partitions = self._cm.find_partitions() + + if not ccm_partitions: + return result + + self._cm.cluster_stable(double_check=True) + + if len(ccm_partitions) != self._cm.partitions_expected: + self._cm.log("ERROR: %d cluster partitions detected:" % len(ccm_partitions)) + result = False + + for partition in ccm_partitions: + self._cm.log("\t %s" % partition) + + for partition in ccm_partitions: + if self._audit_partition(partition) == 0: + result = False + + return result + + def _trim_string(self, avalue): + """ Remove the last character from a multi-character string """ + + if not avalue: + return None + + if len(avalue) > 1: + return avalue[:-1] + + return avalue + + def _trim2int(self, avalue): + """ Remove the last character from a multi-character string and convert + the result to an int. + """ + + trimmed = self._trim_string(avalue) + if trimmed: + return int(trimmed) + + return None + + def _audit_partition(self, partition): + """ Perform the audit of a single partition """ + + passed = True + dc_found = [] + dc_allowed_list = [] + lowest_epoch = None + node_list = partition.split() + + self.debug("Auditing partition: %s" % partition) + for node in node_list: + if self._cm.expected_status[node] != "up": + self._cm.log("Warn: Node %s appeared out of nowhere" % node) + self._cm.expected_status[node] = "up" + # not in itself a reason to fail the audit (not what we're + # checking for in this audit) + + (_, out) = self._cm.rsh(node, self._cm["StatusCmd"] % node, verbose=1) + self._node_state[node] = out[0].strip() + + (_, out) = self._cm.rsh(node, self._cm["EpochCmd"], verbose=1) + self._node_epoch[node] = out[0].strip() + + (_, out) = self._cm.rsh(node, self._cm["QuorumCmd"], verbose=1) + self._node_quorum[node] = out[0].strip() + + self.debug("Node %s: %s - %s - %s." % (node, self._node_state[node], self._node_epoch[node], self._node_quorum[node])) + self._node_state[node] = self._trim_string(self._node_state[node]) + self._node_epoch[node] = self._trim2int(self._node_epoch[node]) + self._node_quorum[node] = self._trim_string(self._node_quorum[node]) + + if not self._node_epoch[node]: + self._cm.log("Warn: Node %s dissappeared: cant determin epoch" % node) + self._cm.expected_status[node] = "down" + # not in itself a reason to fail the audit (not what we're + # checking for in this audit) + elif lowest_epoch is None or self._node_epoch[node] < lowest_epoch: + lowest_epoch = self._node_epoch[node] + + if not lowest_epoch: + self._cm.log("Lowest epoch not determined in %s" % partition) + passed = False + + for node in node_list: + if self._cm.expected_status[node] != "up": + continue + + if self._cm.is_node_dc(node, self._node_state[node]): + dc_found.append(node) + if self._node_epoch[node] == lowest_epoch: + self.debug("%s: OK" % node) + elif not self._node_epoch[node]: + self.debug("Check on %s ignored: no node epoch" % node) + elif not lowest_epoch: + self.debug("Check on %s ignored: no lowest epoch" % node) + else: + self._cm.log("DC %s is not the oldest node (%d vs. %d)" + % (node, self._node_epoch[node], lowest_epoch)) + passed = False + + if not dc_found: + self._cm.log("DC not found on any of the %d allowed nodes: %s (of %s)" + % (len(dc_allowed_list), str(dc_allowed_list), str(node_list))) + + elif len(dc_found) > 1: + self._cm.log("%d DCs (%s) found in cluster partition: %s" + % (len(dc_found), str(dc_found), str(node_list))) + passed = False + + if not passed: + for node in node_list: + if self._cm.expected_status[node] == "up": + self._cm.log("epoch %s : %s" + % (self._node_epoch[node], self._node_state[node])) + + return passed + + def is_applicable(self): + """ Return True if this audit is applicable in the current test configuration. """ + + # @TODO Due to long-ago refactoring, this name test would never match, + # so this audit (and those derived from it) would never run. + # Uncommenting the next lines fixes the name test, but that then + # exposes pre-existing bugs that need to be fixed. + #if self._cm["Name"] == "crm-corosync": + # return True + return False + + +# pylint: disable=invalid-name +def audit_list(cm): + """ Return a list of instances of applicable audits that can be performed + for the given ClusterManager. + """ + + result = [] + + for auditclass in [DiskAudit, FileAudit, LogAudit, ControllerStateAudit, + PartitionAudit, PrimitiveAudit, GroupAudit, CloneAudit, + ColocationAudit, CIBAudit]: + a = auditclass(cm) + if a.is_applicable(): + result.append(a) + + return result diff --git a/python/pacemaker/_cts/cib.py b/python/pacemaker/_cts/cib.py new file mode 100644 index 0000000..b8b5d5d --- /dev/null +++ b/python/pacemaker/_cts/cib.py @@ -0,0 +1,425 @@ +""" CIB generator for Pacemaker's Cluster Test Suite (CTS) """ + +__all__ = ["ConfigFactory"] +__copyright__ = "Copyright 2008-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import warnings +import tempfile + +from pacemaker.buildoptions import BuildOptions +from pacemaker._cts.cibxml import Alerts, Clone, Expression, FencingTopology, Group, Nodes, OpDefaults, Option, Resource, Rule +from pacemaker._cts.network import next_ip + + +class CIB: + """ A class for generating, representing, and installing a CIB file onto + cluster nodes + """ + + def __init__(self, cm, version, factory, tmpfile=None): + """ Create a new CIB instance + + Arguments: + + cm -- A ClusterManager instance + version -- The schema syntax version + factory -- A ConfigFactory instance + tmpfile -- Where to store the CIB, or None to use a new tempfile + """ + + # pylint: disable=invalid-name + self._cib = None + self._cm = cm + self._counter = 1 + self._factory = factory + self._num_nodes = 0 + + self.version = version + + if not tmpfile: + warnings.filterwarnings("ignore") + + # pylint: disable=consider-using-with + f = tempfile.NamedTemporaryFile(delete=True) + f.close() + tmpfile = f.name + + warnings.resetwarnings() + + self._factory.tmpfile = tmpfile + + def _show(self): + """ Query a cluster node for its generated CIB; log and return the result """ + + output = "" + (_, result) = self._factory.rsh(self._factory.target, "HOME=/root CIB_file=%s cibadmin -Ql" % self._factory.tmpfile, verbose=1) + + for line in result: + output += line + self._factory.debug("Generated Config: %s" % line) + + return output + + def new_ip(self, name=None): + """ Generate an IP resource for the next available IP address, optionally + specifying the resource's name. + """ + + if self._cm.env["IPagent"] == "IPaddr2": + ip = next_ip(self._cm.env["IPBase"]) + if not name: + if ":" in ip: + (_, _, suffix) = ip.rpartition(":") + name = "r%s" % suffix + else: + name = "r%s" % ip + + r = Resource(self._factory, name, self._cm.env["IPagent"], "ocf") + r["ip"] = ip + + if ":" in ip: + r["cidr_netmask"] = "64" + r["nic"] = "eth0" + else: + r["cidr_netmask"] = "32" + + else: + if not name: + name = "r%s%d" % (self._cm.env["IPagent"], self._counter) + self._counter += 1 + + r = Resource(self._factory, name, self._cm.env["IPagent"], "ocf") + + r.add_op("monitor", "5s") + return r + + def get_node_id(self, node_name): + """ Check the cluster configuration for the node ID for the given node_name """ + + # We can't account for every possible configuration, + # so we only return a node ID if: + # * The node is specified in /etc/corosync/corosync.conf + # with "ring0_addr:" equal to node_name and "nodeid:" + # explicitly specified. + # In all other cases, we return 0. + node_id = 0 + + # awkward command: use } as record separator + # so each corosync.conf "object" is one record; + # match the "node {" record that has "ring0_addr: node_name"; + # then print the substring of that record after "nodeid:" + awk = r"""awk -v RS="}" """ \ + r"""'/^(\s*nodelist\s*{)?\s*node\s*{.*(ring0_addr|name):\s*%s(\s+|$)/""" \ + r"""{gsub(/.*nodeid:\s*/,"");gsub(/\s+.*$/,"");print}' %s""" \ + % (node_name, BuildOptions.COROSYNC_CONFIG_FILE) + + (rc, output) = self._factory.rsh(self._factory.target, awk, verbose=1) + + if rc == 0 and len(output) == 1: + try: + node_id = int(output[0]) + except ValueError: + node_id = 0 + + return node_id + + def install(self, target): + """ Generate a CIB file and install it to the given cluster node """ + + old = self._factory.tmpfile + + # Force a rebuild + self._cib = None + + self._factory.tmpfile = "%s/cib.xml" % BuildOptions.CIB_DIR + self.contents(target) + self._factory.rsh(self._factory.target, "chown %s %s" % (BuildOptions.DAEMON_USER, self._factory.tmpfile)) + + self._factory.tmpfile = old + + def contents(self, target): + """ Generate a complete CIB file """ + + # fencing resource + if self._cib: + return self._cib + + if target: + self._factory.target = target + + self._factory.rsh(self._factory.target, "HOME=/root cibadmin --empty %s > %s" % (self.version, self._factory.tmpfile)) + self._num_nodes = len(self._cm.env["nodes"]) + + no_quorum = "stop" + if self._num_nodes < 3: + no_quorum = "ignore" + self._factory.log("Cluster only has %d nodes, configuring: no-quorum-policy=ignore" % self._num_nodes) + + # We don't need a nodes section unless we add attributes + stn = None + + # Fencing resource + # Define first so that the shell doesn't reject every update + if self._cm.env["DoFencing"]: + + # Define the "real" fencing device + st = Resource(self._factory, "Fencing", self._cm.env["stonith-type"], "stonith") + + # Set a threshold for unreliable stonith devices such as the vmware one + st.add_meta("migration-threshold", "5") + st.add_op("monitor", "120s", timeout="120s") + st.add_op("stop", "0", timeout="60s") + st.add_op("start", "0", timeout="60s") + + # For remote node tests, a cluster node is stopped and brought back up + # as a remote node with the name "remote-OLDNAME". To allow fencing + # devices to fence these nodes, create a list of all possible node names. + all_node_names = [prefix+n for n in self._cm.env["nodes"] for prefix in ('', 'remote-')] + + # Add all parameters specified by user + entries = self._cm.env["stonith-params"].split(',') + for entry in entries: + try: + (name, value) = entry.split('=', 1) + except ValueError: + print("Warning: skipping invalid fencing parameter: %s" % entry) + continue + + # Allow user to specify "all" as the node list, and expand it here + if name in ["hostlist", "pcmk_host_list"] and value == "all": + value = ' '.join(all_node_names) + + st[name] = value + + st.commit() + + # Test advanced fencing logic + stf_nodes = [] + stt_nodes = [] + attr_nodes = {} + + # Create the levels + stl = FencingTopology(self._factory) + for node in self._cm.env["nodes"]: + # Remote node tests will rename the node + remote_node = "remote-%s" % node + + # Randomly assign node to a fencing method + ftype = self._cm.env.random_gen.choice(["levels-and", "levels-or ", "broadcast "]) + + # For levels-and, randomly choose targeting by node name or attribute + by = "" + + if ftype == "levels-and": + node_id = self.get_node_id(node) + + if node_id == 0 or self._cm.env.random_gen.choice([True, False]): + by = " (by name)" + else: + attr_nodes[node] = node_id + by = " (by attribute)" + + self._cm.log(" - Using %s fencing for node: %s%s" % (ftype, node, by)) + + if ftype == "levels-and": + # If targeting by name, add a topology level for this node + if node not in attr_nodes: + stl.level(1, node, "FencingPass,Fencing") + + # Always target remote nodes by name, otherwise we would need to add + # an attribute to the remote node only during remote tests (we don't + # want nonexistent remote nodes showing up in the non-remote tests). + # That complexity is not worth the effort. + stl.level(1, remote_node, "FencingPass,Fencing") + + # Add the node (and its remote equivalent) to the list of levels-and nodes. + stt_nodes.extend([node, remote_node]) + + elif ftype == "levels-or ": + for n in [node, remote_node]: + stl.level(1, n, "FencingFail") + stl.level(2, n, "Fencing") + + stf_nodes.extend([node, remote_node]) + + # If any levels-and nodes were targeted by attribute, + # create the attributes and a level for the attribute. + if attr_nodes: + stn = Nodes(self._factory) + + for (node_name, node_id) in attr_nodes.items(): + stn.add_node(node_name, node_id, {"cts-fencing": "levels-and"}) + + stl.level(1, None, "FencingPass,Fencing", "cts-fencing", "levels-and") + + # Create a Dummy agent that always passes for levels-and + if stt_nodes: + stt = Resource(self._factory, "FencingPass", "fence_dummy", "stonith") + stt["pcmk_host_list"] = " ".join(stt_nodes) + # Wait this many seconds before doing anything, handy for letting disks get flushed too + stt["random_sleep_range"] = "30" + stt["mode"] = "pass" + stt.commit() + + # Create a Dummy agent that always fails for levels-or + if stf_nodes: + stf = Resource(self._factory, "FencingFail", "fence_dummy", "stonith") + stf["pcmk_host_list"] = " ".join(stf_nodes) + # Wait this many seconds before doing anything, handy for letting disks get flushed too + stf["random_sleep_range"] = "30" + stf["mode"] = "fail" + stf.commit() + + # Now commit the levels themselves + stl.commit() + + o = Option(self._factory) + o["stonith-enabled"] = self._cm.env["DoFencing"] + o["start-failure-is-fatal"] = "false" + o["pe-input-series-max"] = "5000" + o["shutdown-escalation"] = "5min" + o["batch-limit"] = "10" + o["dc-deadtime"] = "5s" + o["no-quorum-policy"] = no_quorum + + o.commit() + + o = OpDefaults(self._factory) + o["timeout"] = "90s" + o.commit() + + # Commit the nodes section if we defined one + if stn is not None: + stn.commit() + + # Add an alerts section if possible + if self._factory.rsh.exists_on_all(self._cm.env["notification-agent"], self._cm.env["nodes"]): + alerts = Alerts(self._factory) + alerts.add_alert(self._cm.env["notification-agent"], + self._cm.env["notification-recipient"]) + alerts.commit() + + # Add resources? + if self._cm.env["CIBResource"]: + self.add_resources() + + # generate cib + self._cib = self._show() + + if self._factory.tmpfile != "%s/cib.xml" % BuildOptions.CIB_DIR: + self._factory.rsh(self._factory.target, "rm -f %s" % self._factory.tmpfile) + + return self._cib + + def add_resources(self): + """ Add various resources and their constraints to the CIB """ + + # Per-node resources + for node in self._cm.env["nodes"]: + name = "rsc_%s" % node + r = self.new_ip(name) + r.prefer(node, "100") + r.commit() + + # Migrator + # Make this slightly sticky (since we have no other location constraints) to avoid relocation during Reattach + m = Resource(self._factory, "migrator", "Dummy", "ocf", "pacemaker") + m["passwd"] = "whatever" + m.add_meta("resource-stickiness", "1") + m.add_meta("allow-migrate", "1") + m.add_op("monitor", "P10S") + m.commit() + + # Ping the test exerciser + p = Resource(self._factory, "ping-1", "ping", "ocf", "pacemaker") + p.add_op("monitor", "60s") + p["host_list"] = self._cm.env["cts-exerciser"] + p["name"] = "connected" + p["debug"] = "true" + + c = Clone(self._factory, "Connectivity", p) + c["globally-unique"] = "false" + c.commit() + + # promotable clone resource + s = Resource(self._factory, "stateful-1", "Stateful", "ocf", "pacemaker") + s.add_op("monitor", "15s", timeout="60s") + s.add_op("monitor", "16s", timeout="60s", role="Promoted") + ms = Clone(self._factory, "promotable-1", s) + ms["promotable"] = "true" + ms["clone-max"] = self._num_nodes + ms["clone-node-max"] = 1 + ms["promoted-max"] = 1 + ms["promoted-node-max"] = 1 + + # Require connectivity to run the promotable clone + r = Rule(self._factory, "connected", "-INFINITY", op="or") + r.add_child(Expression(self._factory, "m1-connected-1", "connected", "lt", "1")) + r.add_child(Expression(self._factory, "m1-connected-2", "connected", "not_defined", None)) + ms.prefer("connected", rule=r) + + ms.commit() + + # Group Resource + g = Group(self._factory, "group-1") + g.add_child(self.new_ip()) + + if self._cm.env["have_systemd"]: + sysd = Resource(self._factory, "petulant", "pacemaker-cts-dummyd@10", "service") + sysd.add_op("monitor", "P10S") + g.add_child(sysd) + else: + g.add_child(self.new_ip()) + + g.add_child(self.new_ip()) + + # Make group depend on the promotable clone + g.after("promotable-1", first="promote", then="start") + g.colocate("promotable-1", "INFINITY", withrole="Promoted") + + g.commit() + + # LSB resource + lsb = Resource(self._factory, "lsb-dummy", "LSBDummy", "lsb") + lsb.add_op("monitor", "5s") + + # LSB with group + lsb.after("group-1") + lsb.colocate("group-1") + + lsb.commit() + + +class ConfigFactory: + """ Singleton to generate a CIB file for the environment's schema version """ + + def __init__(self, cm): + """ Create a new ConfigFactory instance + + Arguments: + + cm -- A ClusterManager instance + """ + + # pylint: disable=invalid-name + self._cm = cm + self.rsh = self._cm.rsh + if not self._cm.env["ListTests"]: + self.target = self._cm.env["nodes"][0] + self.tmpfile = None + + def log(self, args): + """ Log a message """ + + self._cm.log("cib: %s" % args) + + def debug(self, args): + """ Log a debug message """ + + self._cm.debug("cib: %s" % args) + + def create_config(self, name="pacemaker-%s" % BuildOptions.CIB_SCHEMA_VERSION): + """ Return a CIB object for the given schema version """ + + return CIB(self._cm, name, self) diff --git a/python/pacemaker/_cts/cibxml.py b/python/pacemaker/_cts/cibxml.py new file mode 100644 index 0000000..52e3721 --- /dev/null +++ b/python/pacemaker/_cts/cibxml.py @@ -0,0 +1,734 @@ +""" CIB XML generator for Pacemaker's Cluster Test Suite (CTS) """ + +__all__ = [ + "Alerts", + "Clone", + "Expression", + "FencingTopology", + "Group", + "Nodes", + "OpDefaults", + "Option", + "Resource", + "Rule", +] +__copyright__ = "Copyright 2008-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + + +def key_val_string(**kwargs): + """ Given keyword arguments as key=value pairs, construct a single string + containing all those pairs separated by spaces. This is suitable for + using in an XML element as a list of its attributes. + + Any pairs that have value=None will be skipped. + + Note that a dictionary can be passed to this function instead of kwargs + by using a construction like: + + key_val_string(**{"a": 1, "b": 2}) + """ + + retval = "" + + for (k, v) in kwargs.items(): + if v is None: + continue + + retval += ' %s="%s"' % (k, v) + + return retval + + +def element(element_name, **kwargs): + """ Create an XML element string with the given element_name and attributes. + This element does not support having any children, so it will be closed + on the same line. The attributes are processed by key_val_string. + """ + + return "<%s %s/>" % (element_name, key_val_string(**kwargs)) + + +def containing_element(element_name, inner, **kwargs): + """ Like element, but surrounds some child text passed by the inner + parameter. + """ + + attrs = key_val_string(**kwargs) + return "<%s %s>%s</%s>" % (element_name, attrs, inner, element_name) + + +class XmlBase: + """ A base class for deriving all kinds of XML sections in the CIB. This + class contains only the most basic operations common to all sections. + It is up to subclasses to provide most behavior. + + Note that subclasses of this base class often have different sets of + arguments to their __init__ methods. In general this is not a great + practice, however it is so thoroughly used in these classes that trying + to straighten it out is likely to cause more bugs than just leaving it + alone for now. + """ + + def __init__(self, factory, tag, _id, **kwargs): + """ Create a new XmlBase instance + + Arguments: + + factory -- A ConfigFactory instance + tag -- The XML element's start and end tag + _id -- A unique name for the element + kwargs -- Any additional key/value pairs that should be added to + this element as attributes + """ + + self._children = [] + self._factory = factory + self._kwargs = kwargs + self._tag = tag + + self.name = _id + + def __repr__(self): + """ Return a short string description of this XML section """ + + return "%s-%s" % (self._tag, self.name) + + def add_child(self, child): + """ Add an XML section as a child of this one """ + + self._children.append(child) + + def __setitem__(self, key, value): + """ Add a key/value pair to this element, resulting in it becoming an + XML attribute. If value is None, remove the key. + """ + + if value: + self._kwargs[key] = value + else: + self._kwargs.pop(key, None) + + def show(self): + """ Return a string representation of this XML section, including all + of its children + """ + + text = '''<%s''' % self._tag + if self.name: + text += ''' id="%s"''' % self.name + + text += key_val_string(**self._kwargs) + + if not self._children: + text += '''/>''' + return text + + text += '''>''' + + for c in self._children: + text += c.show() + + text += '''</%s>''' % self._tag + return text + + def _run(self, operation, xml, section, options=""): + """ Update the CIB on the cluster to include this XML section, including + all of its children + + Arguments: + + operation -- Whether this update is a "create" or "modify" operation + xml -- The XML to update the CIB with, typically the result + of calling show + section -- Which section of the CIB this update applies to (see + the --scope argument to cibadmin for allowed values) + options -- Extra options to pass to cibadmin + """ + + if self.name: + label = self.name + else: + label = "<%s>" % self._tag + + self._factory.debug("Writing out %s" % label) + + fixed = "HOME=/root CIB_file=%s" % self._factory.tmpfile + fixed += " cibadmin --%s --scope %s %s --xml-text '%s'" % (operation, section, options, xml) + + (rc, _) = self._factory.rsh(self._factory.target, fixed) + if rc != 0: + raise RuntimeError("Configure call failed: %s" % fixed) + + +class InstanceAttributes(XmlBase): + """ A class that creates an <instance_attributes> XML section with + key/value pairs + """ + + def __init__(self, factory, _id, attrs): + """ Create a new InstanceAttributes instance + + Arguments: + + factory -- A ConfigFactory instance + _id -- A unique name for the element + attrs -- Key/value pairs to add as nvpair child elements + """ + + XmlBase.__init__(self, factory, "instance_attributes", _id) + + # Create an <nvpair> for each attribute + for (attr, value) in attrs.items(): + self.add_child(XmlBase(factory, "nvpair", "%s-%s" % (_id, attr), + name=attr, value=value)) + + +class Node(XmlBase): + """ A class that creates a <node> XML section for a single node, complete + with node attributes + """ + + def __init__(self, factory, node_name, node_id, node_attrs): + """ Create a new Node instance + + Arguments: + + factory -- A ConfigFactory instance + node_name -- The value of the uname attribute for this node + node_id -- A unique name for the element + node_attrs -- Additional key/value pairs to set as instance + attributes for this node + """ + + XmlBase.__init__(self, factory, "node", node_id, uname=node_name) + self.add_child(InstanceAttributes(factory, "%s-1" % node_name, node_attrs)) + + +class Nodes(XmlBase): + """ A class that creates a <nodes> XML section containing multiple Node + instances as children + """ + + def __init__(self, factory): + """ Create a new Nodes instance + + Arguments: + + factory -- A ConfigFactory instance + """ + + XmlBase.__init__(self, factory, "nodes", None) + + def add_node(self, node_name, node_id, node_attrs): + """ Add a child node element + + Arguments: + + node_name -- The value of the uname attribute for this node + node_id -- A unique name for the element + node_attrs -- Additional key/value pairs to set as instance + attributes for this node + """ + + self.add_child(Node(self._factory, node_name, node_id, node_attrs)) + + def commit(self): + """ Modify the CIB on the cluster to include this XML section """ + + self._run("modify", self.show(), "configuration", "--allow-create") + + +class FencingTopology(XmlBase): + """ A class that creates a <fencing-topology> XML section describing how + fencing is configured in the cluster + """ + + def __init__(self, factory): + """ Create a new FencingTopology instance + + Arguments: + + factory -- A ConfigFactory instance + """ + + XmlBase.__init__(self, factory, "fencing-topology", None) + + def level(self, index, target, devices, target_attr=None, target_value=None): + """ Generate a <fencing-level> XML element + + index -- The order in which to attempt fencing-levels + (1 through 9). Levels are attempted in ascending + order until one succeeds. + target -- The name of a single node to which this level applies + devices -- A list of devices that must all be tried for this + level + target_attr -- The name of a node attribute that is set for nodes + to which this level applies + target_value -- The value of a node attribute that is set for nodes + to which this level applies + """ + + if target: + xml_id = "cts-%s.%d" % (target, index) + self.add_child(XmlBase(self._factory, "fencing-level", xml_id, target=target, index=index, devices=devices)) + + else: + xml_id = "%s-%s.%d" % (target_attr, target_value, index) + child = XmlBase(self._factory, "fencing-level", xml_id, index=index, devices=devices) + child["target-attribute"] = target_attr + child["target-value"] = target_value + self.add_child(child) + + def commit(self): + """ Create this XML section in the CIB """ + + self._run("create", self.show(), "configuration", "--allow-create") + + +class Option(XmlBase): + """ A class that creates a <cluster_property_set> XML section of key/value + pairs for cluster-wide configuration settings + """ + + def __init__(self, factory, _id="cib-bootstrap-options"): + """ Create a new Option instance + + Arguments: + + factory -- A ConfigFactory instance + _id -- A unique name for the element + """ + + XmlBase.__init__(self, factory, "cluster_property_set", _id) + + def __setitem__(self, key, value): + """ Add a child nvpair element containing the given key/value pair """ + + self.add_child(XmlBase(self._factory, "nvpair", "cts-%s" % key, name=key, value=value)) + + def commit(self): + """ Modify the CIB on the cluster to include this XML section """ + + self._run("modify", self.show(), "crm_config", "--allow-create") + + +class OpDefaults(XmlBase): + """ A class that creates a <cts-op_defaults-meta> XML section of key/value + pairs for operation default settings + """ + + def __init__(self, factory): + """ Create a new OpDefaults instance + + Arguments: + + factory -- A ConfigFactory instance + """ + + XmlBase.__init__(self, factory, "op_defaults", None) + self.meta = XmlBase(self._factory, "meta_attributes", "cts-op_defaults-meta") + self.add_child(self.meta) + + def __setitem__(self, key, value): + """ Add a child nvpair meta_attribute element containing the given + key/value pair + """ + + self.meta.add_child(XmlBase(self._factory, "nvpair", "cts-op_defaults-%s" % key, name=key, value=value)) + + def commit(self): + """ Modify the CIB on the cluster to include this XML section """ + + self._run("modify", self.show(), "configuration", "--allow-create") + + +class Alerts(XmlBase): + """ A class that creates an <alerts> XML section """ + + def __init__(self, factory): + """ Create a new Alerts instance + + Arguments: + + factory -- A ConfigFactory instance + """ + + XmlBase.__init__(self, factory, "alerts", None) + self._alert_count = 0 + + def add_alert(self, path, recipient): + """ Create a new alert as a child of this XML section + + Arguments: + + path -- The path to a script to be called when a cluster + event occurs + recipient -- An environment variable to be passed to the script + """ + + self._alert_count += 1 + alert = XmlBase(self._factory, "alert", "alert-%d" % self._alert_count, + path=path) + recipient1 = XmlBase(self._factory, "recipient", + "alert-%d-recipient-1" % self._alert_count, + value=recipient) + alert.add_child(recipient1) + self.add_child(alert) + + def commit(self): + """ Modify the CIB on the cluster to include this XML section """ + + self._run("modify", self.show(), "configuration", "--allow-create") + + +class Expression(XmlBase): + """ A class that creates an <expression> XML element as part of some + constraint rule + """ + + def __init__(self, factory, _id, attr, op, value=None): + """ Create a new Expression instance + + Arguments: + + factory -- A ConfigFactory instance + _id -- A unique name for the element + attr -- The attribute to be tested + op -- The comparison to perform ("lt", "eq", "defined", etc.) + value -- Value for comparison (can be None for "defined" and + "not_defined" operations) + """ + + XmlBase.__init__(self, factory, "expression", _id, attribute=attr, operation=op) + if value: + self["value"] = value + + +class Rule(XmlBase): + """ A class that creates a <rule> XML section consisting of one or more + expressions, as part of some constraint + """ + + def __init__(self, factory, _id, score, op="and", expr=None): + """ Create a new Rule instance + + Arguments: + + factory -- A ConfigFactory instance + _id -- A unique name for the element + score -- If this rule is used in a location constraint and + evaluates to true, apply this score to the constraint + op -- If this rule contains more than one expression, use this + boolean op when evaluating + expr -- An Expression instance that can be added to this Rule + when it is created + """ + + XmlBase.__init__(self, factory, "rule", _id) + + self["boolean-op"] = op + self["score"] = score + + if expr: + self.add_child(expr) + + +class Resource(XmlBase): + """ A base class that creates all kinds of <resource> XML sections fully + describing a single cluster resource. This defaults to primitive + resources, but subclasses can create other types. + """ + + def __init__(self, factory, _id, rtype, standard, provider=None): + """ Create a new Resource instance + + Arguments: + + factory -- A ConfigFactory instance + _id -- A unique name for the element + rtype -- The name of the resource agent + standard -- The standard the resource agent follows ("ocf", + "systemd", etc.) + provider -- The vendor providing the resource agent + """ + + XmlBase.__init__(self, factory, "native", _id) + + self._provider = provider + self._rtype = rtype + self._standard = standard + + self._meta = {} + self._op = [] + self._param = {} + + self._coloc = {} + self._needs = {} + self._scores = {} + + if self._standard == "ocf" and not provider: + self._provider = "heartbeat" + elif self._standard == "lsb": + self._provider = None + + def __setitem__(self, key, value): + """ Add a child nvpair element containing the given key/value pair as + an instance attribute + """ + + self._add_param(key, value) + + def add_op(self, _id, interval, **kwargs): + """ Add an operation child XML element to this resource + + Arguments: + + _id -- A unique name for the element. Also, the action to + perform ("monitor", "start", "stop", etc.) + interval -- How frequently (in seconds) to perform the operation + kwargs -- Any additional key/value pairs that should be added to + this element as attributes + """ + + self._op.append(XmlBase(self._factory, "op", "%s-%s" % (_id, interval), + name=_id, interval=interval, **kwargs)) + + def _add_param(self, name, value): + """ Add a child nvpair element containing the given key/value pair as + an instance attribute + """ + + self._param[name] = value + + def add_meta(self, name, value): + """ Add a child nvpair element containing the given key/value pair as + a meta attribute + """ + + self._meta[name] = value + + def prefer(self, node, score="INFINITY", rule=None): + """ Add a location constraint where this resource prefers some node + + Arguments: + + node -- The name of the node to prefer + score -- Apply this score to the location constraint + rule -- A Rule instance to use in creating this constraint, instead + of creating a new rule + """ + + if not rule: + rule = Rule(self._factory, "prefer-%s-r" % node, score, + expr=Expression(self._factory, "prefer-%s-e" % node, "#uname", "eq", node)) + + self._scores[node] = rule + + def after(self, resource, kind="Mandatory", first="start", then="start", **kwargs): + """ Create an ordering constraint between this resource and some other + + Arguments: + + resource -- The name of the dependent resource + kind -- How to enforce the constraint ("mandatory", "optional", + "serialize") + first -- The action that this resource must complete before the + then-action can be initiated for the dependent resource + ("start", "stop", "promote", "demote") + then -- The action that the dependent resource can execute only + after the first-action has completed (same values as + first) + kwargs -- Any additional key/value pairs that should be added to + this element as attributes + """ + + kargs = kwargs.copy() + kargs["kind"] = kind + + if then: + kargs["first-action"] = "start" + kargs["then-action"] = then + + if first: + kargs["first-action"] = first + + self._needs[resource] = kargs + + def colocate(self, resource, score="INFINITY", role=None, withrole=None, **kwargs): + """ Create a colocation constraint between this resource and some other + + Arguments: + + resource -- The name of the resource that should be located relative + this one + score -- Apply this score to the colocation constraint + role -- Apply this colocation constraint only to promotable clones + in this role ("started", "promoted", "unpromoted") + withrole -- Apply this colocation constraint only to with-rsc promotable + clones in this role + kwargs -- Any additional key/value pairs that should be added to + this element as attributes + """ + + kargs = kwargs.copy() + kargs["score"] = score + + if role: + kargs["rsc-role"] = role + + if withrole: + kargs["with-rsc-role"] = withrole + + self._coloc[resource] = kargs + + def _constraints(self): + """ Generate a <constraints> XML section containing all previously added + ordering and colocation constraints + """ + + text = "<constraints>" + + for (k, v) in self._scores.items(): + attrs = {"id": "prefer-%s" % k, "rsc": self.name} + text += containing_element("rsc_location", v.show(), **attrs) + + for (k, kargs) in self._needs.items(): + attrs = {"id": "%s-after-%s" % (self.name, k), "first": k, "then": self.name} + text += element("rsc_order", **attrs, **kargs) + + for (k, kargs) in self._coloc.items(): + attrs = {"id": "%s-with-%s" % (self.name, k), "rsc": self.name, "with-rsc": k} + text += element("rsc_colocation", **attrs) + + text += "</constraints>" + return text + + def show(self): + """ Return a string representation of this XML section, including all + of its children + """ + + text = '''<primitive id="%s" class="%s" type="%s"''' % (self.name, self._standard, self._rtype) + + if self._provider: + text += ''' provider="%s"''' % self._provider + + text += '''>''' + + if self._meta: + nvpairs = "" + for (p, v) in self._meta.items(): + attrs = {"id": "%s-%s" % (self.name, p), "name": p, "value": v} + nvpairs += element("nvpair", **attrs) + + text += containing_element("meta_attributes", nvpairs, + id="%s-meta" % self.name) + + if self._param: + nvpairs = "" + for (p, v) in self._param.items(): + attrs = {"id": "%s-%s" % (self.name, p), "name": p, "value": v} + nvpairs += element("nvpair", **attrs) + + text += containing_element("instance_attributes", nvpairs, + id="%s-params" % self.name) + + if self._op: + text += '''<operations>''' + + for o in self._op: + key = o.name + o.name = "%s-%s" % (self.name, key) + text += o.show() + o.name = key + + text += '''</operations>''' + + text += '''</primitive>''' + return text + + def commit(self): + """ Modify the CIB on the cluster to include this XML section """ + + self._run("create", self.show(), "resources") + self._run("modify", self._constraints(), "constraints") + + +class Group(Resource): + """ A specialized Resource subclass that creates a <group> XML section + describing a single group resource consisting of multiple child + primitive resources + """ + + def __init__(self, factory, _id): + """ Create a new Group instance + + Arguments: + + factory -- A ConfigFactory instance + _id -- A unique name for the element + """ + + Resource.__init__(self, factory, _id, None, None) + self.tag = "group" + + def __setitem__(self, key, value): + self.add_meta(key, value) + + def show(self): + """ Return a string representation of this XML section, including all + of its children + """ + + text = '''<%s id="%s">''' % (self.tag, self.name) + + if len(self._meta) > 0: + nvpairs = "" + for (p, v) in self._meta.items(): + attrs = {"id": "%s-%s" % (self.name, p), "name": p, "value": v} + nvpairs += element("nvpair", **attrs) + + text += containing_element("meta_attributes", nvpairs, + id="%s-meta" % self.name) + + for c in self._children: + text += c.show() + + text += '''</%s>''' % self.tag + return text + + +class Clone(Group): + """ A specialized Group subclass that creates a <clone> XML section + describing a clone resource containing multiple instances of a + single primitive resource + """ + + def __init__(self, factory, _id, child=None): + """ Create a new Clone instance + + Arguments: + + factory -- A ConfigFactory instance + _id -- A unique name for the element + child -- A Resource instance that can be added to this Clone + when it is created. Alternately, use add_child later. + Note that a Clone may only have one child. + """ + + Group.__init__(self, factory, _id) + self.tag = "clone" + + if child: + self.add_child(child) + + def add_child(self, child): + """ Add the given resource as a child of this Clone. Note that a + Clone resource only supports one child at a time. + """ + + if not self._children: + self._children.append(child) + else: + self._factory.log("Clones can only have a single child. Ignoring %s" % child.name) diff --git a/python/pacemaker/_cts/clustermanager.py b/python/pacemaker/_cts/clustermanager.py new file mode 100644 index 0000000..652108f --- /dev/null +++ b/python/pacemaker/_cts/clustermanager.py @@ -0,0 +1,916 @@ +""" ClusterManager class for Pacemaker's Cluster Test Suite (CTS) """ + +__all__ = ["ClusterManager"] +__copyright__ = """Copyright 2000-2023 the Pacemaker project contributors. +Certain portions by Huang Zhen <zhenhltc@cn.ibm.com> are copyright 2004 +International Business Machines. The version control history for this file +may have further details.""" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import os +import re +import time + +from collections import UserDict + +from pacemaker.buildoptions import BuildOptions +from pacemaker._cts.CTS import NodeStatus +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.cib import ConfigFactory +from pacemaker._cts.environment import EnvFactory +from pacemaker._cts.logging import LogFactory +from pacemaker._cts.patterns import PatternSelector +from pacemaker._cts.remote import RemoteFactory +from pacemaker._cts.watcher import LogWatcher + +# pylint doesn't understand that self._rsh is callable (it stores the +# singleton instance of RemoteExec, as returned by the getInstance method +# of RemoteFactory). It's possible we could fix this with type annotations, +# but those were introduced with python 3.5 and we only support python 3.4. +# I think we could also fix this by getting rid of the getInstance methods, +# but that's a project for another day. For now, just disable the warning. +# pylint: disable=not-callable + +# ClusterManager has a lot of methods. +# pylint: disable=too-many-public-methods + +class ClusterManager(UserDict): + """ An abstract base class for managing the cluster. This class implements + high-level operations on the cluster and/or its cluster managers. + Actual cluster-specific management classes should be subclassed from this + one. + + Among other things, this class tracks the state every node is expected to + be in. + """ + + def _final_conditions(self): + """ Check all keys to make sure they have a non-None value """ + + for (key, val) in self._data.items(): + if val is None: + raise ValueError("Improper derivation: self[%s] must be overridden by subclass." % key) + + def __init__(self): + """ Create a new ClusterManager instance. This class can be treated + kind of like a dictionary due to the process of certain dict + functions like __getitem__ and __setitem__. This is because it + contains a lot of name/value pairs. However, it is not actually + a dictionary so do not rely on standard dictionary behavior. + """ + + # Eventually, ClusterManager should not be a UserDict subclass. Until + # that point... + # pylint: disable=super-init-not-called + self.__instance_errors_to_ignore = [] + + self._cib_installed = False + self._data = {} + self._logger = LogFactory() + + self.env = EnvFactory().getInstance() + self.expected_status = {} + self.name = self.env["Name"] + # pylint: disable=invalid-name + self.ns = NodeStatus(self.env) + self.our_node = os.uname()[1].lower() + self.partitions_expected = 1 + self.rsh = RemoteFactory().getInstance() + self.templates = PatternSelector(self.env["Name"]) + + self._final_conditions() + + self._cib_factory = ConfigFactory(self) + self._cib = self._cib_factory.create_config(self.env["Schema"]) + self._cib_sync = {} + + def __getitem__(self, key): + if key == "Name": + return self.name + + print("FIXME: Getting %s from %r" % (key, self)) + if key in self._data: + return self._data[key] + + return self.templates.get_patterns(key) + + def __setitem__(self, key, value): + print("FIXME: Setting %s=%s on %r" % (key, value, self)) + self._data[key] = value + + def clear_instance_errors_to_ignore(self): + """ Reset instance-specific errors to ignore on each iteration """ + + self.__instance_errors_to_ignore = [] + + @property + def instance_errors_to_ignore(self): + """ Return a list of known errors that should be ignored for a specific + test instance + """ + + return self.__instance_errors_to_ignore + + @property + def errors_to_ignore(self): + """ Return a list of known error messages that should be ignored """ + + return self.templates.get_patterns("BadNewsIgnore") + + def log(self, args): + """ Log a message """ + + self._logger.log(args) + + def debug(self, args): + """ Log a debug message """ + + self._logger.debug(args) + + def upcount(self): + """ How many nodes are up? """ + + count = 0 + + for node in self.env["nodes"]: + if self.expected_status[node] == "up": + count += 1 + + return count + + def install_support(self, command="install"): + """ Install or uninstall the CTS support files - various init scripts and data, + daemons, fencing agents, etc. + """ + + for node in self.env["nodes"]: + self.rsh(node, "%s/cts-support %s" % (BuildOptions.DAEMON_DIR, command)) + + def prepare_fencing_watcher(self): + """ Return a LogWatcher object that watches for fencing log messages """ + + # If we don't have quorum now but get it as a result of starting this node, + # then a bunch of nodes might get fenced + if self.has_quorum(None): + self.debug("Have quorum") + return None + + if not self.templates["Pat:Fencing_start"]: + print("No start pattern") + return None + + if not self.templates["Pat:Fencing_ok"]: + print("No ok pattern") + return None + + stonith = None + stonith_pats = [] + for peer in self.env["nodes"]: + if self.expected_status[peer] == "up": + continue + + stonith_pats.extend([ + self.templates["Pat:Fencing_ok"] % peer, + self.templates["Pat:Fencing_start"] % peer, + ]) + + stonith = LogWatcher(self.env["LogFileName"], stonith_pats, self.env["nodes"], + self.env["LogWatcher"], "StartupFencing", 0) + stonith.set_watch() + return stonith + + def fencing_cleanup(self, node, stonith): + """ Wait for a previously fenced node to return to the cluster """ + + peer_list = [] + peer_state = {} + + self.debug("Looking for nodes that were fenced as a result of %s starting" % node) + + # If we just started a node, we may now have quorum (and permission to fence) + if not stonith: + self.debug("Nothing to do") + return peer_list + + q = self.has_quorum(None) + if not q and len(self.env["nodes"]) > 2: + # We didn't gain quorum - we shouldn't have shot anyone + self.debug("Quorum: %s Len: %d" % (q, len(self.env["nodes"]))) + return peer_list + + for n in self.env["nodes"]: + peer_state[n] = "unknown" + + # Now see if any states need to be updated + self.debug("looking for: %r" % stonith.regexes) + shot = stonith.look(0) + + while shot: + self.debug("Found: %r" % shot) + del stonith.regexes[stonith.whichmatch] + + # Extract node name + for n in self.env["nodes"]: + if re.search(self.templates["Pat:Fencing_ok"] % n, shot): + peer = n + peer_state[peer] = "complete" + self.__instance_errors_to_ignore.append(self.templates["Pat:Fencing_ok"] % peer) + + elif peer_state[n] != "complete" and re.search(self.templates["Pat:Fencing_start"] % n, shot): + # TODO: Correctly detect multiple fencing operations for the same host + peer = n + peer_state[peer] = "in-progress" + self.__instance_errors_to_ignore.append(self.templates["Pat:Fencing_start"] % peer) + + if not peer: + self._logger.log("ERROR: Unknown stonith match: %r" % shot) + + elif not peer in peer_list: + self.debug("Found peer: %s" % peer) + peer_list.append(peer) + + # Get the next one + shot = stonith.look(60) + + for peer in peer_list: + self.debug(" Peer %s was fenced as a result of %s starting: %s" % (peer, node, peer_state[peer])) + if self.env["at-boot"]: + self.expected_status[peer] = "up" + else: + self.expected_status[peer] = "down" + + if peer_state[peer] == "in-progress": + # Wait for any in-progress operations to complete + shot = stonith.look(60) + + while stonith.regexes and shot: + self.debug("Found: %r" % shot) + del stonith.regexes[stonith.whichmatch] + shot = stonith.look(60) + + # Now make sure the node is alive too + self.ns.wait_for_node(peer, self.env["DeadTime"]) + + # Poll until it comes up + if self.env["at-boot"]: + if not self.stat_cm(peer): + time.sleep(self.env["StartTime"]) + + if not self.stat_cm(peer): + self._logger.log("ERROR: Peer %s failed to restart after being fenced" % peer) + return None + + return peer_list + + def start_cm(self, node, verbose=False): + """ Start up the cluster manager on a given node """ + + if verbose: + self._logger.log("Starting %s on node %s" % (self.templates["Name"], node)) + else: + self.debug("Starting %s on node %s" % (self.templates["Name"], node)) + + if not node in self.expected_status: + self.expected_status[node] = "down" + + if self.expected_status[node] != "down": + return True + + # Technically we should always be able to notice ourselves starting + patterns = [ + self.templates["Pat:Local_started"] % node, + ] + + if self.upcount() == 0: + patterns.append(self.templates["Pat:DC_started"] % node) + else: + patterns.append(self.templates["Pat:NonDC_started"] % node) + + watch = LogWatcher(self.env["LogFileName"], patterns, self.env["nodes"], self.env["LogWatcher"], + "StartaCM", self.env["StartTime"] + 10) + + self.install_config(node) + + self.expected_status[node] = "any" + + if self.stat_cm(node) and self.cluster_stable(self.env["DeadTime"]): + self._logger.log("%s was already started" % node) + return True + + stonith = self.prepare_fencing_watcher() + watch.set_watch() + + (rc, _) = self.rsh(node, self.templates["StartCmd"]) + if rc != 0: + self._logger.log("Warn: Start command failed on node %s" % node) + self.fencing_cleanup(node, stonith) + return False + + self.expected_status[node] = "up" + watch_result = watch.look_for_all() + + if watch.unmatched: + for regex in watch.unmatched: + self._logger.log("Warn: Startup pattern not found: %s" % regex) + + if watch_result and self.cluster_stable(self.env["DeadTime"]): + self.fencing_cleanup(node, stonith) + return True + + if self.stat_cm(node) and self.cluster_stable(self.env["DeadTime"]): + self.fencing_cleanup(node, stonith) + return True + + self._logger.log("Warn: Start failed for node %s" % node) + return False + + def start_cm_async(self, node, verbose=False): + """ Start up the cluster manager on a given node without blocking """ + + if verbose: + self._logger.log("Starting %s on node %s" % (self["Name"], node)) + else: + self.debug("Starting %s on node %s" % (self["Name"], node)) + + self.install_config(node) + self.rsh(node, self.templates["StartCmd"], synchronous=False) + self.expected_status[node] = "up" + + def stop_cm(self, node, verbose=False, force=False): + """ Stop the cluster manager on a given node """ + + if verbose: + self._logger.log("Stopping %s on node %s" % (self["Name"], node)) + else: + self.debug("Stopping %s on node %s" % (self["Name"], node)) + + if self.expected_status[node] != "up" and not force: + return True + + (rc, _) = self.rsh(node, self.templates["StopCmd"]) + if rc == 0: + # Make sure we can continue even if corosync leaks + self.expected_status[node] = "down" + self.cluster_stable(self.env["DeadTime"]) + return True + + self._logger.log("ERROR: Could not stop %s on node %s" % (self["Name"], node)) + return False + + def stop_cm_async(self, node): + """ Stop the cluster manager on a given node without blocking """ + + self.debug("Stopping %s on node %s" % (self["Name"], node)) + + self.rsh(node, self.templates["StopCmd"], synchronous=False) + self.expected_status[node] = "down" + + def startall(self, nodelist=None, verbose=False, quick=False): + """ Start the cluster manager on every node in the cluster, or on every + node in nodelist if not None + """ + + if not nodelist: + nodelist = self.env["nodes"] + + for node in nodelist: + if self.expected_status[node] == "down": + self.ns.wait_for_all_nodes(nodelist, 300) + + if not quick: + # This is used for "basic sanity checks", so only start one node ... + return self.start_cm(nodelist[0], verbose=verbose) + + # Approximation of SimulStartList for --boot + watchpats = [ + self.templates["Pat:DC_IDLE"], + ] + for node in nodelist: + watchpats.extend([ + self.templates["Pat:InfraUp"] % node, + self.templates["Pat:PacemakerUp"] % node, + self.templates["Pat:Local_started"] % node, + self.templates["Pat:They_up"] % (nodelist[0], node), + ]) + + # Start all the nodes - at about the same time... + watch = LogWatcher(self.env["LogFileName"], watchpats, self.env["nodes"], + self.env["LogWatcher"], "fast-start", self.env["DeadTime"] + 10) + watch.set_watch() + + if not self.start_cm(nodelist[0], verbose=verbose): + return False + + for node in nodelist: + self.start_cm_async(node, verbose=verbose) + + watch.look_for_all() + if watch.unmatched: + for regex in watch.unmatched: + self._logger.log("Warn: Startup pattern not found: %s" % regex) + + if not self.cluster_stable(): + self._logger.log("Cluster did not stabilize") + return False + + return True + + def stopall(self, nodelist=None, verbose=False, force=False): + """ Stop the cluster manager on every node in the cluster, or on every + node in nodelist if not None + """ + + ret = True + + if not nodelist: + nodelist = self.env["nodes"] + + for node in self.env["nodes"]: + if self.expected_status[node] == "up" or force: + if not self.stop_cm(node, verbose=verbose, force=force): + ret = False + + return ret + + def statall(self, nodelist=None): + """ Return the status of the cluster manager on every node in the cluster, + or on every node in nodelist if not None + """ + + result = {} + + if not nodelist: + nodelist = self.env["nodes"] + + for node in nodelist: + if self.stat_cm(node): + result[node] = "up" + else: + result[node] = "down" + + return result + + def isolate_node(self, target, nodes=None): + """ Break communication between the target node and all other nodes in the + cluster, or nodes if not None + """ + + if not nodes: + nodes = self.env["nodes"] + + for node in nodes: + if node == target: + continue + + (rc, _) = self.rsh(target, self.templates["BreakCommCmd"] % node) + if rc != 0: + self._logger.log("Could not break the communication between %s and %s: %d" % (target, node, rc)) + return False + + self.debug("Communication cut between %s and %s" % (target, node)) + + return True + + def unisolate_node(self, target, nodes=None): + """ Re-establish communication between the target node and all other nodes + in the cluster, or nodes if not None + """ + + if not nodes: + nodes = self.env["nodes"] + + for node in nodes: + if node == target: + continue + + # Limit the amount of time we have asynchronous connectivity for + # Restore both sides as simultaneously as possible + self.rsh(target, self.templates["FixCommCmd"] % node, synchronous=False) + self.rsh(node, self.templates["FixCommCmd"] % target, synchronous=False) + self.debug("Communication restored between %s and %s" % (target, node)) + + def oprofile_start(self, node=None): + """ Start profiling on the given node, or all nodes in the cluster """ + + if not node: + for n in self.env["oprofile"]: + self.oprofile_start(n) + + elif node in self.env["oprofile"]: + self.debug("Enabling oprofile on %s" % node) + self.rsh(node, "opcontrol --init") + self.rsh(node, "opcontrol --setup --no-vmlinux --separate=lib --callgraph=20 --image=all") + self.rsh(node, "opcontrol --start") + self.rsh(node, "opcontrol --reset") + + def oprofile_save(self, test, node=None): + """ Save profiling data and restart profiling on the given node, or all + nodes in the cluster if None + """ + + if not node: + for n in self.env["oprofile"]: + self.oprofile_save(test, n) + + elif node in self.env["oprofile"]: + self.rsh(node, "opcontrol --dump") + self.rsh(node, "opcontrol --save=cts.%d" % test) + # Read back with: opreport -l session:cts.0 image:<directory>/c* + self.oprofile_stop(node) + self.oprofile_start(node) + + def oprofile_stop(self, node=None): + """ Start profiling on the given node, or all nodes in the cluster. This + does not save profiling data, so call oprofile_save first if needed. + """ + + if not node: + for n in self.env["oprofile"]: + self.oprofile_stop(n) + + elif node in self.env["oprofile"]: + self.debug("Stopping oprofile on %s" % node) + self.rsh(node, "opcontrol --reset") + self.rsh(node, "opcontrol --shutdown 2>&1 > /dev/null") + + def install_config(self, node): + """ Remove and re-install the CIB on the first node in the cluster """ + + if not self.ns.wait_for_node(node): + self.log("Node %s is not up." % node) + return + + if node in self._cib_sync or not self.env["ClobberCIB"]: + return + + self._cib_sync[node] = True + self.rsh(node, "rm -f %s/cib*" % BuildOptions.CIB_DIR) + + # Only install the CIB on the first node, all the other ones will pick it up from there + if self._cib_installed: + return + + self._cib_installed = True + if self.env["CIBfilename"] is None: + self.log("Installing Generated CIB on node %s" % node) + self._cib.install(node) + + else: + self.log("Installing CIB (%s) on node %s" % (self.env["CIBfilename"], node)) + + rc = self.rsh.copy(self.env["CIBfilename"], "root@" + (self.templates["CIBfile"] % node)) + + if rc != 0: + raise ValueError("Can not scp file to %s %d" % (node, rc)) + + self.rsh(node, "chown %s %s/cib.xml" % (BuildOptions.DAEMON_USER, BuildOptions.CIB_DIR)) + + def prepare(self): + """ Finish initialization by clearing out the expected status and recording + the current status of every node in the cluster + """ + + self.partitions_expected = 1 + for node in self.env["nodes"]: + self.expected_status[node] = "" + + if self.env["experimental-tests"]: + self.unisolate_node(node) + + self.stat_cm(node) + + def test_node_cm(self, node): + """ Check the status of a given node. Returns 0 if the node is + down, 1 if the node is up but unstable, and 2 if the node is + up and stable + """ + + watchpats = [ + "Current ping state: (S_IDLE|S_NOT_DC)", + self.templates["Pat:NonDC_started"] % node, + self.templates["Pat:DC_started"] % node, + ] + + idle_watch = LogWatcher(self.env["LogFileName"], watchpats, [node], + self.env["LogWatcher"], "ClusterIdle") + idle_watch.set_watch() + + (_, out) = self.rsh(node, self.templates["StatusCmd"] % node, verbose=1) + + if not out: + out = "" + else: + out = out[0].strip() + + self.debug("Node %s status: '%s'" % (node, out)) + + if out.find('ok') < 0: + if self.expected_status[node] == "up": + self.log("Node status for %s is %s but we think it should be %s" + % (node, "down", self.expected_status[node])) + + self.expected_status[node] = "down" + return 0 + + if self.expected_status[node] == "down": + self.log("Node status for %s is %s but we think it should be %s: %s" + % (node, "up", self.expected_status[node], out)) + + self.expected_status[node] = "up" + + # check the output first - because syslog-ng loses messages + if out.find('S_NOT_DC') != -1: + # Up and stable + return 2 + + if out.find('S_IDLE') != -1: + # Up and stable + return 2 + + # fall back to syslog-ng and wait + if not idle_watch.look(): + # just up + self.debug("Warn: Node %s is unstable: %s" % (node, out)) + return 1 + + # Up and stable + return 2 + + def stat_cm(self, node): + """ Report the status of the cluster manager on a given node """ + + return self.test_node_cm(node) > 0 + + # Being up and being stable is not the same question... + def node_stable(self, node): + """ Return whether or not the given node is stable """ + + if self.test_node_cm(node) == 2: + return True + + self.log("Warn: Node %s not stable" % node) + return False + + def partition_stable(self, nodes, timeout=None): + """ Return whether or not all nodes in the given partition are stable """ + + watchpats = [ + "Current ping state: S_IDLE", + self.templates["Pat:DC_IDLE"], + ] + + self.debug("Waiting for cluster stability...") + + if timeout is None: + timeout = self.env["DeadTime"] + + if len(nodes) < 3: + self.debug("Cluster is inactive") + return True + + idle_watch = LogWatcher(self.env["LogFileName"], watchpats, nodes.split(), + self.env["LogWatcher"], "ClusterStable", timeout) + idle_watch.set_watch() + + for node in nodes.split(): + # have each node dump its current state + self.rsh(node, self.templates["StatusCmd"] % node, verbose=1) + + ret = idle_watch.look() + + while ret: + self.debug(ret) + + for node in nodes.split(): + if re.search(node, ret): + return True + + ret = idle_watch.look() + + self.debug("Warn: Partition %r not IDLE after %ds" % (nodes, timeout)) + return False + + def cluster_stable(self, timeout=None, double_check=False): + """ Return whether or not all nodes in the cluster are stable """ + + partitions = self.find_partitions() + + for partition in partitions: + if not self.partition_stable(partition, timeout): + return False + + if not double_check: + return True + + # Make sure we are really stable and that all resources, + # including those that depend on transient node attributes, + # are started if they were going to be + time.sleep(5) + for partition in partitions: + if not self.partition_stable(partition, timeout): + return False + + return True + + def is_node_dc(self, node, status_line=None): + """ Return whether or not the given node is the cluster DC by checking + the given status_line, or by querying the cluster if None + """ + + if not status_line: + (_, out) = self.rsh(node, self.templates["StatusCmd"] % node, verbose=1) + + if out: + status_line = out[0].strip() + + if not status_line: + return False + + if status_line.find('S_IDLE') != -1: + return True + + if status_line.find('S_INTEGRATION') != -1: + return True + + if status_line.find('S_FINALIZE_JOIN') != -1: + return True + + if status_line.find('S_POLICY_ENGINE') != -1: + return True + + if status_line.find('S_TRANSITION_ENGINE') != -1: + return True + + return False + + def active_resources(self, node): + """ Return a list of primitive resources active on the given node """ + + (_, output) = self.rsh(node, "crm_resource -c", verbose=1) + resources = [] + for line in output: + if not re.search("^Resource", line): + continue + + tmp = AuditResource(self, line) + if tmp.type == "primitive" and tmp.host == node: + resources.append(tmp.id) + + return resources + + def resource_location(self, rid): + """ Return a list of nodes on which the given resource is running """ + + resource_nodes = [] + for node in self.env["nodes"]: + if self.expected_status[node] != "up": + continue + + cmd = self.templates["RscRunning"] % rid + (rc, lines) = self.rsh(node, cmd) + + if rc == 127: + self.log("Command '%s' failed. Binary or pacemaker-cts package not installed?" % cmd) + for line in lines: + self.log("Output: %s " % line) + + elif rc == 0: + resource_nodes.append(node) + + return resource_nodes + + def find_partitions(self): + """ Return a list of all partitions in the cluster. Each element of the + list is itself a list of all active nodes in that partition. + """ + + ccm_partitions = [] + + for node in self.env["nodes"]: + if self.expected_status[node] != "up": + self.debug("Node %s is down... skipping" % node) + continue + + (_, out) = self.rsh(node, self.templates["PartitionCmd"], verbose=1) + + if not out: + self.log("no partition details for %s" % node) + continue + + partition = out[0].strip() + + if len(partition) <= 2: + self.log("bad partition details for %s" % node) + continue + + nodes = partition.split() + nodes.sort() + partition = ' '.join(nodes) + + found = 0 + for a_partition in ccm_partitions: + if partition == a_partition: + found = 1 + + if found == 0: + self.debug("Adding partition from %s: %s" % (node, partition)) + ccm_partitions.append(partition) + else: + self.debug("Partition '%s' from %s is consistent with existing entries" % (partition, node)) + + self.debug("Found partitions: %r" % ccm_partitions) + return ccm_partitions + + def has_quorum(self, node_list): + """ Return whether or not the cluster has quorum """ + + # If we are auditing a partition, then one side will + # have quorum and the other not. + # So the caller needs to tell us which we are checking + # If no value for node_list is specified... assume all nodes + if not node_list: + node_list = self.env["nodes"] + + for node in node_list: + if self.expected_status[node] != "up": + continue + + (_, quorum) = self.rsh(node, self.templates["QuorumCmd"], verbose=1) + quorum = quorum[0].strip() + + if quorum.find("1") != -1: + return True + + if quorum.find("0") != -1: + return False + + self.debug("WARN: Unexpected quorum test result from %s:%s" % (node, quorum)) + + return False + + @property + def components(self): + """ A list of all patterns that should be ignored for the cluster's + components. This must be provided by all subclasses. + """ + + raise NotImplementedError + + def in_standby_mode(self, node): + """ Return whether or not the node is in Standby """ + + (_, out) = self.rsh(node, self.templates["StandbyQueryCmd"] % node, verbose=1) + + if not out: + return False + + out = out[0].strip() + self.debug("Standby result: %s" % out) + return out == "on" + + def set_standby_mode(self, node, status): + """ Set node to Standby if status is True, or Active if status is False. + Return whether the node is now in the requested status. + """ + + current_status = self.in_standby_mode(node) + + if current_status == status: + return True + + if status: + cmd = self.templates["StandbyCmd"] % (node, "on") + else: + cmd = self.templates["StandbyCmd"] % (node, "off") + + (rc, _) = self.rsh(node, cmd) + return rc == 0 + + def add_dummy_rsc(self, node, rid): + """ Add a dummy resource with the given ID to the given node """ + + rsc_xml = """ '<resources> + <primitive class=\"ocf\" id=\"%s\" provider=\"pacemaker\" type=\"Dummy\"> + <operations> + <op id=\"%s-interval-10s\" interval=\"10s\" name=\"monitor\"/ + </operations> + </primitive> + </resources>'""" % (rid, rid) + constraint_xml = """ '<constraints> + <rsc_location id=\"location-%s-%s\" node=\"%s\" rsc=\"%s\" score=\"INFINITY\"/> + </constraints>' + """ % (rid, node, node, rid) + + self.rsh(node, self.templates['CibAddXml'] % rsc_xml) + self.rsh(node, self.templates['CibAddXml'] % constraint_xml) + + def remove_dummy_rsc(self, node, rid): + """ Remove the previously added dummy resource given by rid on the + given node + """ + + constraint = "\"//rsc_location[@rsc='%s']\"" % rid + rsc = "\"//primitive[@id='%s']\"" % rid + + self.rsh(node, self.templates['CibDelXpath'] % constraint) + self.rsh(node, self.templates['CibDelXpath'] % rsc) diff --git a/python/pacemaker/_cts/cmcorosync.py b/python/pacemaker/_cts/cmcorosync.py new file mode 100644 index 0000000..cac059b --- /dev/null +++ b/python/pacemaker/_cts/cmcorosync.py @@ -0,0 +1,80 @@ +""" Corosync-specific class for Pacemaker's Cluster Test Suite (CTS) """ + +__all__ = ["Corosync2"] +__copyright__ = "Copyright 2007-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.CTS import Process +from pacemaker._cts.clustermanager import ClusterManager +from pacemaker._cts.patterns import PatternSelector + +# Throughout this file, pylint has trouble understanding that EnvFactory +# is a singleton instance that can be treated as a subscriptable object. +# Various warnings are disabled because of this. See also a comment about +# self._rsh in environment.py. +# pylint: disable=unsubscriptable-object + +class Corosync2(ClusterManager): + """ A subclass of ClusterManager specialized to handle corosync2 and later + based clusters + """ + + def __init__(self): + """ Create a new Corosync2 instance """ + + ClusterManager.__init__(self) + + self._fullcomplist = {} + self.templates = PatternSelector(self.name) + + @property + def components(self): + """ A list of all patterns that should be ignored for the cluster's + components. + """ + + complist = [] + + if not self._fullcomplist: + common_ignore = self.templates.get_component("common-ignore") + + daemons = [ + "pacemaker-based", + "pacemaker-controld", + "pacemaker-attrd", + "pacemaker-execd", + "pacemaker-fenced" + ] + for c in daemons: + badnews = self.templates.get_component("%s-ignore" % c) + common_ignore + proc = Process(self, c, pats=self.templates.get_component(c), + badnews_ignore=badnews) + self._fullcomplist[c] = proc + + # the scheduler uses dc_pats instead of pats + badnews = self.templates.get_component("pacemaker-schedulerd-ignore") + common_ignore + proc = Process(self, "pacemaker-schedulerd", + dc_pats=self.templates.get_component("pacemaker-schedulerd"), + badnews_ignore=badnews) + self._fullcomplist["pacemaker-schedulerd"] = proc + + # add (or replace) extra components + badnews = self.templates.get_component("corosync-ignore") + common_ignore + proc = Process(self, "corosync", pats=self.templates.get_component("corosync"), + badnews_ignore=badnews) + self._fullcomplist["corosync"] = proc + + # Processes running under valgrind can't be shot with "killall -9 processname", + # so don't include them in the returned list + vgrind = self.env["valgrind-procs"].split() + for (key, val) in self._fullcomplist.items(): + if self.env["valgrind-tests"] and key in vgrind: + self.log("Filtering %s from the component list as it is being profiled by valgrind" % key) + continue + + if key == "pacemaker-fenced" and not self.env["DoFencing"]: + continue + + complist.append(val) + + return complist diff --git a/python/pacemaker/_cts/environment.py b/python/pacemaker/_cts/environment.py index e4d70e6..732ab24 100644 --- a/python/pacemaker/_cts/environment.py +++ b/python/pacemaker/_cts/environment.py @@ -11,6 +11,7 @@ import socket import sys import time +from pacemaker.buildoptions import BuildOptions from pacemaker._cts.logging import LogFactory from pacemaker._cts.remote import RemoteFactory from pacemaker._cts.watcher import LogKind @@ -31,7 +32,7 @@ class Environment: def __init__(self, args): """ Create a new Environment instance. This class can be treated kind of like a dictionary due to the presence of typical dict functions - like has_key, __getitem__, and __setitem__. However, it is not a + like __contains__, __getitem__, and __setitem__. However, it is not a dictionary so do not rely on standard dictionary behavior. Arguments: @@ -100,7 +101,7 @@ class Environment: return list(self.data.keys()) - def has_key(self, key): + def __contains__(self, key): """ Does the given environment key exist? """ if key == "nodes": @@ -120,10 +121,7 @@ class Environment: if key == "Name": return self._get_stack_short() - if key in self.data: - return self.data[key] - - return None + return self.data.get(key) def __setitem__(self, key, value): """ Set the given environment key to the given value, overriding any @@ -161,6 +159,14 @@ class Environment: return self.random_gen.choice(self["nodes"]) + def get(self, key, default=None): + """ Return the value for key if key is in the environment, else default """ + + if key == "nodes": + return self._nodes + + return self.data.get(key, default) + def _set_stack(self, name): """ Normalize the given cluster stack name """ @@ -279,7 +285,7 @@ class Environment: # pylint: disable=no-member if int(self["IPBase"].split('.')[3]) >= 240: self._logger.log("Could not determine an offset for IPaddr resources. Upper bound is too high: %s %s" - % (self["IPBase"], self["IPBase"].split('.')[3])) + % (self["IPBase"], self["IPBase"].split('.')[3])) self["IPBase"] = " fe80::1234:56:7890:1000" self._logger.log("Defaulting to '%s', use --test-ip-base to override" % self["IPBase"]) @@ -294,7 +300,7 @@ class Environment: # it as an int in __init__ and treat it as an int everywhere. # pylint: disable=bad-string-format-type self._logger.log("Limiting the number of nodes configured=%d (max=%d)" - %(len(self["nodes"]), self["node-limit"])) + % (len(self["nodes"]), self["node-limit"])) while len(self["nodes"]) > self["node-limit"]: self["nodes"].pop(len(self["nodes"])-1) @@ -398,15 +404,9 @@ class Environment: grp4.add_argument("--boot", action="store_true", help="") - grp4.add_argument("--bsc", - action="store_true", - help="") grp4.add_argument("--cib-filename", metavar="PATH", help="Install the given CIB file to the cluster") - grp4.add_argument("--container-tests", - action="store_true", - help="Include pacemaker_remote tests that run in lxc container resources") grp4.add_argument("--experimental-tests", action="store_true", help="Include experimental tests") @@ -438,7 +438,7 @@ class Environment: help="Use QARSH to access nodes instead of SSH") grp4.add_argument("--schema", metavar="SCHEMA", - default="pacemaker-3.0", + default="pacemaker-%s" % BuildOptions.CIB_SCHEMA_VERSION, help="Create a CIB conforming to the given schema") grp4.add_argument("--seed", metavar="SEED", @@ -491,7 +491,6 @@ class Environment: self["at-boot"] = args.at_boot in ["1", "yes"] self["benchmark"] = args.benchmark self["continue"] = args.always_continue - self["container-tests"] = args.container_tests self["experimental-tests"] = args.experimental_tests self["iterations"] = args.iterations self["loop-minutes"] = args.loop_minutes @@ -542,10 +541,6 @@ class Environment: if args.boot: self["scenario"] = "boot" - if args.bsc: - self["DoBSC"] = True - self["scenario"] = "basic-sanity" - if args.cib_filename: self["CIBfilename"] = args.cib_filename else: diff --git a/python/pacemaker/_cts/input.py b/python/pacemaker/_cts/input.py new file mode 100644 index 0000000..7e734f6 --- /dev/null +++ b/python/pacemaker/_cts/input.py @@ -0,0 +1,18 @@ +""" User input related utilities for CTS """ + +__all__ = ["should_continue"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +def should_continue(env): + """ On failure, prompt the user to see if we should continue """ + + if env["continue"]: + return True + + try: + answer = input("Continue? [yN]") + except EOFError: + answer = "n" + + return answer in ["y", "Y"] diff --git a/python/pacemaker/_cts/logging.py b/python/pacemaker/_cts/logging.py index d9f3012..6c7bfb0 100644 --- a/python/pacemaker/_cts/logging.py +++ b/python/pacemaker/_cts/logging.py @@ -21,7 +21,7 @@ class Logger: self._logfile = filename if tag: - self._source = tag + ": " + self._source = "%s: " % tag else: self._source = "" diff --git a/python/pacemaker/_cts/network.py b/python/pacemaker/_cts/network.py new file mode 100644 index 0000000..33e401f --- /dev/null +++ b/python/pacemaker/_cts/network.py @@ -0,0 +1,59 @@ +""" Network related utilities for CTS """ + +__all__ = ["next_ip"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +# pylint: disable=global-statement +CURRENT_IP = None + +def next_ip(ip_base=None, reset=False): + """ Return the next available IP address. + + Arguments: + + ip_base -- The initial IP address to start from. The first call to next_ip + will return the next IP address from this base. Each subsequent + call will return the next address from the previous call, so you + can just omit this argument for subsequent calls. + reset -- Force next_ip to start from ip_base again. This requires also + passing the ip_base argument. (Mostly useful for unit testing, + but may be useful elsewhere). + + This function only increments the last portion of the IP address. Once it + has hit the upper limit, ValueError will be raised. + """ + + global CURRENT_IP + + if CURRENT_IP is None or reset: + CURRENT_IP = ip_base + + new_ip = None + + # Split the existing IP address up into a tuple of: + # (everything except the last part of the addr, the separator, the last part of the addr). + # For instance, "192.168.1.2" becomes ("192.168.1", ".", "2"). Then, + # increment the last part of the address and paste everything back + # together. + if ":" in CURRENT_IP: + # This is an IPv6 address + fields = CURRENT_IP.rpartition(":") + new_ip = int(fields[2], 16) + 1 + + if new_ip > 0xffff: + raise ValueError("No more available IP addresses") + + # hex() puts "0x" at the front of the string, so strip it off. + new_ip = hex(new_ip)[2:] + + else: + # This is an IPv4 address + fields = CURRENT_IP.rpartition(".") + new_ip = int(fields[2]) + 1 + + if new_ip > 255: + raise ValueError("No more available IP addresses") + + CURRENT_IP = "%s%s%s" % (fields[0], fields[1], new_ip) + return CURRENT_IP diff --git a/python/pacemaker/_cts/patterns.py b/python/pacemaker/_cts/patterns.py index 880477a..0fb1c2b 100644 --- a/python/pacemaker/_cts/patterns.py +++ b/python/pacemaker/_cts/patterns.py @@ -220,7 +220,7 @@ class Corosync2Patterns(BasePatterns): r"pending LRM operations at shutdown", r"Lost connection to the CIB manager", r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported", - r"pacemaker-controld.*:\s*Performing A_EXIT_1 - forcefully exiting ", + r"pacemaker-controld.*:\s*Exiting now due to errors", r".*:\s*Requesting fencing \([^)]+\) targeting node ", r"(Blackbox dump requested|Problem detected)", ] @@ -238,7 +238,7 @@ class Corosync2Patterns(BasePatterns): r"error:.*Connection to cib_(shm|rw).* (failed|closed)", r"error:.*cib_(shm|rw) IPC provider disconnected while waiting", r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", - r"crit: Fencing daemon connection failed", + r"error: Lost fencer connection", # This is overbroad, but we don't have a way to say that only # certain transition errors are acceptable (if the fencer respawns, # fence devices may appear multiply active). We have to rely on @@ -253,7 +253,7 @@ class Corosync2Patterns(BasePatterns): # it's possible for another daemon to lose that connection and # exit before losing the cluster connection. r"pacemakerd.*:\s*warning:.*Lost connection to cluster layer", - r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", + r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (Corosync process group|the CIB manager)", r"pacemaker-based.*:\s*(crit|error):.*Lost connection to cluster layer", r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", @@ -290,7 +290,7 @@ class Corosync2Patterns(BasePatterns): ] self._components["pacemaker-execd"] = [ - r"pacemaker-controld.*Connection to executor failed", + r"pacemaker-controld.*Lost connection to local executor", r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", @@ -317,7 +317,7 @@ class Corosync2Patterns(BasePatterns): r"State transition .* S_RECOVERY", r"pacemakerd.* Respawning pacemaker-controld subdaemon after unexpected exit", r"pacemaker-controld\[[0-9]+\] exited with status 1 \(", - r"Connection to the scheduler failed", + r"pacemaker-controld.*Lost connection to the scheduler", r"pacemaker-controld.*I_ERROR.*save_cib_contents", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", @@ -329,13 +329,13 @@ class Corosync2Patterns(BasePatterns): self._components["pacemaker-fenced"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", - r"Fencing daemon connection failed", + r"Lost fencer connection", r"pacemaker-controld.*Fencer successfully connected", ] self._components["pacemaker-fenced-ignore"] = [ r"(error|warning):.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", - r"crit:.*Fencing daemon connection failed", + r"error:.*Lost fencer connection", r"error:.*Fencer connection failed \(will retry\)", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)", # This is overbroad, but we don't have a way to say that only diff --git a/python/pacemaker/_cts/process.py b/python/pacemaker/_cts/process.py index 2940b71..757360c 100644 --- a/python/pacemaker/_cts/process.py +++ b/python/pacemaker/_cts/process.py @@ -63,7 +63,7 @@ def pipe_communicate(pipes, check_stderr=False, stdin=None): output = pipe_outputs[0].decode(sys.stdout.encoding) if check_stderr: - output = output + pipe_outputs[1].decode(sys.stderr.encoding) + output += pipe_outputs[1].decode(sys.stderr.encoding) return output diff --git a/python/pacemaker/_cts/remote.py b/python/pacemaker/_cts/remote.py index 99d2ed7..4b6b8f6 100644 --- a/python/pacemaker/_cts/remote.py +++ b/python/pacemaker/_cts/remote.py @@ -7,7 +7,7 @@ __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT AN import re import os -from subprocess import Popen,PIPE +from subprocess import Popen, PIPE from threading import Thread from pacemaker._cts.logging import LogFactory @@ -71,7 +71,7 @@ class AsyncCmd(Thread): self._proc.wait() if self._delegate: - self._logger.debug("cmd: pid %d returned %d to %s" % (self._proc.pid, self._proc.returncode, repr(self._delegate))) + self._logger.debug("cmd: pid %d returned %d to %r" % (self._proc.pid, self._proc.returncode, self._delegate)) else: self._logger.debug("cmd: pid %d returned %d" % (self._proc.pid, self._proc.returncode)) @@ -126,7 +126,7 @@ class RemoteExec: sysname = args[0] command = args[1] - if sysname is None or sysname.lower() == self._our_node or sysname == "localhost": + if sysname is None or sysname.lower() in [self._our_node, "localhost"]: ret = command else: ret = "%s %s '%s'" % (self._command, sysname, self._fixcmd(command)) @@ -188,7 +188,7 @@ class RemoteExec: result = None # pylint: disable=consider-using-with proc = Popen(self._cmd([node, command]), - stdout = PIPE, stderr = PIPE, close_fds = True, shell = True) + stdout=PIPE, stderr=PIPE, close_fds=True, shell=True) if not synchronous and proc.pid > 0 and not self._silent: aproc = AsyncCmd(node, command, proc=proc) diff --git a/python/pacemaker/_cts/scenarios.py b/python/pacemaker/_cts/scenarios.py new file mode 100644 index 0000000..769b2d0 --- /dev/null +++ b/python/pacemaker/_cts/scenarios.py @@ -0,0 +1,422 @@ +""" Test scenario classes for Pacemaker's Cluster Test Suite (CTS) """ + +__all__ = [ + "AllOnce", + "Boot", + "BootCluster", + "LeaveBooted", + "RandomTests", + "Sequence", +] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re +import time + +from pacemaker._cts.audits import ClusterAudit +from pacemaker._cts.input import should_continue +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.watcher import LogWatcher + +class ScenarioComponent: + """ The base class for all scenario components. A scenario component is + one single step in a scenario. Each component is basically just a setup + and teardown method. + """ + + def __init__(self, cm, env): + """ Create a new ScenarioComponent instance + + Arguments: + + cm -- A ClusterManager instance + env -- An Environment instance + """ + + # pylint: disable=invalid-name + self._cm = cm + self._env = env + + def is_applicable(self): + """ Return True if this component is applicable in the given Environment. + This method must be provided by all subclasses. + """ + + raise NotImplementedError + + def setup(self): + """ Set up the component, returning True on success. This method must be + provided by all subclasses. + """ + + raise NotImplementedError + + def teardown(self): + """ Tear down the given component. This method must be provided by all + subclasses. + """ + + raise NotImplementedError + + +class Scenario: + """ The base class for scenario. A scenario is an ordered list of + ScenarioComponent objects. A scenario proceeds by setting up all its + components in sequence, running a list of tests and audits, and then + tearing down its components in reverse. + """ + + def __init__(self, cm, components, audits, tests): + """ Create a new Scenario instance + + Arguments: + + cm -- A ClusterManager instance + components -- A list of ScenarioComponents comprising this Scenario + audits -- A list of ClusterAudits that will be performed as + part of this Scenario + tests -- A list of CTSTests that will be run + """ + + # pylint: disable=invalid-name + + self.stats = { + "success": 0, + "failure": 0, + "BadNews": 0, + "skipped": 0 + } + self.tests = tests + + self._audits = audits + self._bad_news = None + self._cm = cm + self._components = components + + for comp in components: + if not issubclass(comp.__class__, ScenarioComponent): + raise ValueError("Init value must be subclass of ScenarioComponent") + + for audit in audits: + if not issubclass(audit.__class__, ClusterAudit): + raise ValueError("Init value must be subclass of ClusterAudit") + + for test in tests: + if not issubclass(test.__class__, CTSTest): + raise ValueError("Init value must be a subclass of CTSTest") + + def is_applicable(self): + """ Return True if all ScenarioComponents are applicable """ + + for comp in self._components: + if not comp.is_applicable(): + return False + + return True + + def setup(self): + """ Set up the scenario, returning True on success. If setup fails at + some point, tear down those components that did successfully set up. + """ + + self._cm.prepare() + self.audit() # Also detects remote/local log config + self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) + + self.audit() + self._cm.install_support() + + self._bad_news = LogWatcher(self._cm.env["LogFileName"], + self._cm.templates.get_patterns("BadNews"), + self._cm.env["nodes"], + self._cm.env["LogWatcher"], + "BadNews", 0) + self._bad_news.set_watch() # Call after we've figured out what type of log watching to do in LogAudit + + j = 0 + while j < len(self._components): + if not self._components[j].setup(): + # OOPS! We failed. Tear partial setups down. + self.audit() + self._cm.log("Tearing down partial setup") + self.teardown(j) + return False + + j += 1 + + self.audit() + return True + + def teardown(self, n_components=None): + """ Tear down the scenario in the reverse order it was set up. If + n_components is not None, only tear down that many components. + """ + + if not n_components: + n_components = len(self._components)-1 + + j = n_components + + while j >= 0: + self._components[j].teardown() + j -= 1 + + self.audit() + self._cm.install_support("uninstall") + + def incr(self, name): + """ Increment the given stats key """ + + if not name in self.stats: + self.stats[name] = 0 + + self.stats[name] += 1 + + def run(self, iterations): + """ Run all tests in the scenario the given number of times """ + + self._cm.oprofile_start() + + try: + self._run_loop(iterations) + self._cm.oprofile_stop() + except: + self._cm.oprofile_stop() + raise + + def _run_loop(self, iterations): + """ Do the hard part of the run method - actually run all the tests the + given number of times. + """ + + raise NotImplementedError + + def run_test(self, test, testcount): + """ Run the given test. testcount is the number of tests (including + this one) that have been run across all iterations. + """ + + nodechoice = self._cm.env.random_node() + + ret = True + did_run = False + + self._cm.clear_instance_errors_to_ignore() + choice = "(%s)" % nodechoice + self._cm.log("Running test {:<22} {:<15} [{:>3}]".format(test.name, choice, testcount)) + + starttime = test.set_timer() + + if not test.setup(nodechoice): + self._cm.log("Setup failed") + ret = False + else: + did_run = True + ret = test(nodechoice) + + if not test.teardown(nodechoice): + self._cm.log("Teardown failed") + + if not should_continue(self._cm.env): + raise ValueError("Teardown of %s on %s failed" % (test.name, nodechoice)) + + ret = False + + stoptime = time.time() + self._cm.oprofile_save(testcount) + + elapsed_time = stoptime - starttime + test_time = stoptime - test.get_timer() + + if "min_time" not in test.stats: + test.stats["elapsed_time"] = elapsed_time + test.stats["min_time"] = test_time + test.stats["max_time"] = test_time + else: + test.stats["elapsed_time"] += elapsed_time + + if test_time < test.stats["min_time"]: + test.stats["min_time"] = test_time + + if test_time > test.stats["max_time"]: + test.stats["max_time"] = test_time + + if ret: + self.incr("success") + test.log_timer() + else: + self.incr("failure") + self._cm.statall() + did_run = True # Force the test count to be incremented anyway so test extraction works + + self.audit(test.errors_to_ignore) + return did_run + + def summarize(self): + """ Output scenario results """ + + self._cm.log("****************") + self._cm.log("Overall Results:%r" % self.stats) + self._cm.log("****************") + + stat_filter = { + "calls": 0, + "failure": 0, + "skipped": 0, + "auditfail": 0, + } + + self._cm.log("Test Summary") + for test in self.tests: + for key in stat_filter: + stat_filter[key] = test.stats[key] + + name = "Test %s:" % test.name + self._cm.log("{:<25} {!r}".format(name, stat_filter)) + + self._cm.debug("Detailed Results") + for test in self.tests: + name = "Test %s:" % test.name + self._cm.debug("{:<25} {!r}".format(name, stat_filter)) + + self._cm.log("<<<<<<<<<<<<<<<< TESTS COMPLETED") + + def audit(self, local_ignore=None): + """ Perform all scenario audits and log results. If there are too many + failures, prompt the user to confirm that the scenario should continue + running. + """ + + errcount = 0 + + ignorelist = ["CTS:"] + + if local_ignore: + ignorelist.extend(local_ignore) + + ignorelist.extend(self._cm.errors_to_ignore) + ignorelist.extend(self._cm.instance_errors_to_ignore) + + # This makes sure everything is stabilized before starting... + failed = 0 + for audit in self._audits: + if not audit(): + self._cm.log("Audit %s FAILED." % audit.name) + failed += 1 + else: + self._cm.debug("Audit %s passed." % audit.name) + + while errcount < 1000: + match = None + if self._bad_news: + match = self._bad_news.look(0) + + if match: + add_err = True + + for ignore in ignorelist: + if add_err and re.search(ignore, match): + add_err = False + + if add_err: + self._cm.log("BadNews: %s" % match) + self.incr("BadNews") + errcount += 1 + else: + break + else: + print("Big problems") + if not should_continue(self._cm.env): + self._cm.log("Shutting down.") + self.summarize() + self.teardown() + raise ValueError("Looks like we hit a BadNews jackpot!") + + if self._bad_news: + self._bad_news.end() + + return failed + + +class AllOnce(Scenario): + """ Every Test Once """ + + def _run_loop(self, iterations): + testcount = 1 + + for test in self.tests: + self.run_test(test, testcount) + testcount += 1 + + +class RandomTests(Scenario): + """ Random Test Execution """ + + def _run_loop(self, iterations): + testcount = 1 + + while testcount <= iterations: + test = self._cm.env.random_gen.choice(self.tests) + self.run_test(test, testcount) + testcount += 1 + + +class Sequence(Scenario): + """ Named Tests in Sequence """ + + def _run_loop(self, iterations): + testcount = 1 + + while testcount <= iterations: + for test in self.tests: + self.run_test(test, testcount) + testcount += 1 + + +class Boot(Scenario): + """ Start the Cluster """ + + def _run_loop(self, iterations): + return + + +class BootCluster(ScenarioComponent): + """ The BootCluster component simply starts the cluster manager on all + nodes, waiting for each to come up before starting given that a node + might have been rebooted or crashed beforehand. + """ + + def is_applicable(self): + """ BootCluster is always applicable """ + + return True + + def setup(self): + """ Set up the component, returning True on success """ + + self._cm.prepare() + + # Clear out the cobwebs ;-) + self._cm.stopall(verbose=True, force=True) + + # Now start the Cluster Manager on all the nodes. + self._cm.log("Starting Cluster Manager on all nodes.") + return self._cm.startall(verbose=True, quick=True) + + def teardown(self): + """ Tear down the component """ + + self._cm.log("Stopping Cluster Manager on all nodes") + self._cm.stopall(verbose=True, force=False) + + +class LeaveBooted(BootCluster): + """ The LeaveBooted component leaves all nodes up when the scenario + is complete. + """ + + def teardown(self): + """ Tear down the component """ + + self._cm.log("Leaving Cluster running on all nodes") diff --git a/python/pacemaker/_cts/test.py b/python/pacemaker/_cts/test.py index fb809a9..577ebb3 100644 --- a/python/pacemaker/_cts/test.py +++ b/python/pacemaker/_cts/test.py @@ -147,7 +147,7 @@ class Test: this requires all subclasses to set self._daemon_location before accessing this property or an exception will be raised. """ - return os.path.join(self.logdir, self._daemon_location + ".log") + return os.path.join(self.logdir, "%s.log" % self._daemon_location) ### ### PRIVATE METHODS @@ -287,6 +287,17 @@ class Test: self._patterns.append(Pattern(pattern, negative=negative, regex=regex)) + def _signal_dict(self): + """ Return a dictionary mapping signal numbers to their names """ + + # FIXME: When we support python >= 3.5, this function can be replaced with: + # signal.Signals(self.daemon_process.returncode).name + return { + getattr(signal, _signame): _signame + for _signame in dir(signal) + if _signame.startswith("SIG") and not _signame.startswith("SIG_") + } + def clean_environment(self): """ Clean up the host after executing a test """ @@ -295,13 +306,11 @@ class Test: self._daemon_process.terminate() self._daemon_process.wait() else: - return_code = { - getattr(signal, _signame): _signame - for _signame in dir(signal) - if _signame.startswith('SIG') and not _signame.startswith("SIG_") - }.get(-self._daemon_process.returncode, "RET=%d" % (self._daemon_process.returncode)) + rc = self._daemon_process.returncode + signame = self._signal_dict().get(-rc, "RET=%s" % rc) msg = "FAILURE - '%s' failed. %s abnormally exited during test (%s)." - self._result_txt = msg % (self.name, self._daemon_location, return_code) + + self._result_txt = msg % (self.name, self._daemon_location, signame) self.exitcode = ExitStatus.ERROR self._daemon_process = None @@ -311,7 +320,7 @@ class Test: # makes fenced output any kind of 8 bit value - while still interesting # for debugging and we'd still like the regression-test to go over the # full set of test-cases - with open(self.logpath, 'rt', encoding = "ISO-8859-1") as logfile: + with open(self.logpath, 'rt', encoding="ISO-8859-1") as logfile: for line in logfile.readlines(): self._daemon_output += line @@ -361,7 +370,7 @@ class Test: if self.verbose: print("Step %d SUCCESS" % (i)) - i = i + 1 + i += 1 self.clean_environment() @@ -427,7 +436,7 @@ class Test: if args['validate']: if args['check_rng']: - rng_file = rng_directory() + "/api/api-result.rng" + rng_file = "%s/api/api-result.rng" % rng_directory() else: rng_file = None @@ -478,7 +487,7 @@ class Test: if not self.force_wait and logfile is None \ and os.path.exists(self.logpath): - logfile = io.open(self.logpath, 'rt', encoding = "ISO-8859-1") + logfile = io.open(self.logpath, 'rt', encoding="ISO-8859-1") if not self.force_wait and logfile is not None: for line in logfile.readlines(): @@ -562,10 +571,10 @@ class Tests: continue if test.exitcode != ExitStatus.OK: - failures = failures + 1 + failures += 1 test.print_result(" ") else: - success = success + 1 + success += 1 if failures == 0: print(" None") diff --git a/python/pacemaker/_cts/tests/Makefile.am b/python/pacemaker/_cts/tests/Makefile.am new file mode 100644 index 0000000..0dba74b --- /dev/null +++ b/python/pacemaker/_cts/tests/Makefile.am @@ -0,0 +1,14 @@ +# +# Copyright 2023 the Pacemaker project contributors +# +# The version control history for this file may have further details. +# +# This source code is licensed under the GNU General Public License version 2 +# or later (GPLv2+) WITHOUT ANY WARRANTY. +# + +MAINTAINERCLEANFILES = Makefile.in + +pkgpythondir = $(pythondir)/$(PACKAGE)/_cts/tests + +pkgpython_PYTHON = $(wildcard *.py) diff --git a/python/pacemaker/_cts/tests/__init__.py b/python/pacemaker/_cts/tests/__init__.py new file mode 100644 index 0000000..63b34aa --- /dev/null +++ b/python/pacemaker/_cts/tests/__init__.py @@ -0,0 +1,87 @@ +""" +Test classes for the `pacemaker._cts` package. +""" + +__copyright__ = "Copyright 2023 the Pacemaker project contributors" +__license__ = "GNU Lesser General Public License version 2.1 or later (LGPLv2.1+)" + +from pacemaker._cts.tests.componentfail import ComponentFail +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.fliptest import FlipTest +from pacemaker._cts.tests.maintenancemode import MaintenanceMode +from pacemaker._cts.tests.nearquorumpointtest import NearQuorumPointTest +from pacemaker._cts.tests.partialstart import PartialStart +from pacemaker._cts.tests.reattach import Reattach +from pacemaker._cts.tests.restartonebyone import RestartOnebyOne +from pacemaker._cts.tests.resourcerecover import ResourceRecover +from pacemaker._cts.tests.restarttest import RestartTest +from pacemaker._cts.tests.resynccib import ResyncCIB +from pacemaker._cts.tests.remotebasic import RemoteBasic +from pacemaker._cts.tests.remotedriver import RemoteDriver +from pacemaker._cts.tests.remotemigrate import RemoteMigrate +from pacemaker._cts.tests.remoterscfailure import RemoteRscFailure +from pacemaker._cts.tests.remotestonithd import RemoteStonithd +from pacemaker._cts.tests.simulstart import SimulStart +from pacemaker._cts.tests.simulstop import SimulStop +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite import SimulStopLite +from pacemaker._cts.tests.splitbraintest import SplitBrainTest +from pacemaker._cts.tests.standbytest import StandbyTest +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.tests.startonebyone import StartOnebyOne +from pacemaker._cts.tests.stonithdtest import StonithdTest +from pacemaker._cts.tests.stoponebyone import StopOnebyOne +from pacemaker._cts.tests.stoptest import StopTest + +def test_list(cm, audits): + """ Return a list of test class objects that are enabled and whose + is_applicable methods return True. These are the tests that + should be run. + """ + + # cm is a reasonable name here. + # pylint: disable=invalid-name + + # A list of all enabled test classes, in the order that they should + # be run (if we're doing --once). There are various other ways of + # specifying which tests should be run, in which case the order here + # will not matter. + # + # Note that just because a test is listed here doesn't mean it will + # definitely be run - is_applicable is still taken into consideration. + # Also note that there are other tests that are excluded from this + # list for various reasons. + enabled_test_classes = [ + FlipTest, + RestartTest, + StonithdTest, + StartOnebyOne, + SimulStart, + SimulStop, + StopOnebyOne, + RestartOnebyOne, + PartialStart, + StandbyTest, + MaintenanceMode, + ResourceRecover, + ComponentFail, + SplitBrainTest, + Reattach, + ResyncCIB, + NearQuorumPointTest, + RemoteBasic, + RemoteStonithd, + RemoteMigrate, + RemoteRscFailure, + ] + + result = [] + + for testclass in enabled_test_classes: + bound_test = testclass(cm) + + if bound_test.is_applicable(): + bound_test.audits = audits + result.append(bound_test) + + return result diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py new file mode 100644 index 0000000..f3d3622 --- /dev/null +++ b/python/pacemaker/_cts/tests/componentfail.py @@ -0,0 +1,167 @@ +""" Kill a pacemaker daemon and test how the cluster recovers """ + +__all__ = ["ComponentFail"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class ComponentFail(CTSTest): + """ A concrete test that kills a random pacemaker daemon and waits for the + cluster to recover + """ + + def __init__(self, cm): + """ Create a new ComponentFail instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.is_unsafe = True + self.name = "ComponentFail" + + self._complist = cm.components + self._okerrpatterns = [] + self._patterns = [] + self._startall = SimulStartLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + self._patterns = [] + self._okerrpatterns = [] + + # start all nodes + ret = self._startall(None) + if not ret: + return self.failure("Setup failed") + + if not self._cm.cluster_stable(self._env["StableTime"]): + return self.failure("Setup failed - unstable") + + node_is_dc = self._cm.is_node_dc(node, None) + + # select a component to kill + chosen = self._env.random_gen.choice(self._complist) + while chosen.dc_only and not node_is_dc: + chosen = self._env.random_gen.choice(self._complist) + + self.debug("...component %s (dc=%s)" % (chosen.name, node_is_dc)) + self.incr(chosen.name) + + if chosen.name != "corosync": + self._patterns.extend([ + self.templates["Pat:ChildKilled"] % (node, chosen.name), + self.templates["Pat:ChildRespawn"] % (node, chosen.name), + ]) + + self._patterns.extend(chosen.pats) + if node_is_dc: + self._patterns.extend(chosen.dc_pats) + + # @TODO this should be a flag in the Component + if chosen.name in ["corosync", "pacemaker-based", "pacemaker-fenced"]: + # Ignore actions for fence devices if fencer will respawn + # (their registration will be lost, and probes will fail) + self._okerrpatterns = [ + self.templates["Pat:Fencing_active"], + ] + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + + for line in lines: + if re.search("^Resource", line): + r = AuditResource(self._cm, line) + + if r.rclass == "stonith": + self._okerrpatterns.extend([ + self.templates["Pat:Fencing_recover"] % r.id, + self.templates["Pat:Fencing_probe"] % r.id, + ]) + + # supply a copy so self.patterns doesn't end up empty + tmp_pats = self._patterns.copy() + self._patterns.extend(chosen.badnews_ignore) + + # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status + stonith_pats = [ + self.templates["Pat:Fencing_ok"] % node + ] + stonith = self.create_watch(stonith_pats, 0) + stonith.set_watch() + + # set the watch for stable + watch = self.create_watch( + tmp_pats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) + + watch.set_watch() + + # kill the component + chosen.kill(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + + self.debug("Waiting for any fenced node to come back up") + self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) + + self.debug("Waiting for the cluster to re-stabilize with all nodes") + self._cm.cluster_stable(self._env["StartTime"]) + + self.debug("Checking if %s was shot" % node) + shot = stonith.look(60) + + if shot: + self.debug("Found: %r" % shot) + self._okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) + + if not self._env["at-boot"]: + self._cm.expected_status[node] = "down" + + # If fencing occurred, chances are many (if not all) the expected logs + # will not be sent - or will be lost when the node reboots + return self.success() + + # check for logs indicating a graceful recovery + matched = watch.look_for_all(allow_multiple_matches=True) + if watch.unmatched: + self._logger.log("Patterns not found: %r" % watch.unmatched) + + self.debug("Waiting for the cluster to re-stabilize with all nodes") + is_stable = self._cm.cluster_stable(self._env["StartTime"]) + + if not matched: + return self.failure("Didn't find all expected %s patterns" % chosen.name) + + if not is_stable: + return self.failure("Cluster did not become stable after killing %s" % chosen.name) + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + # Note that okerrpatterns refers to the last time we ran this test + # The good news is that this works fine for us... + self._okerrpatterns.extend(self._patterns) + return self._okerrpatterns diff --git a/python/pacemaker/_cts/tests/ctstest.py b/python/pacemaker/_cts/tests/ctstest.py new file mode 100644 index 0000000..8669e48 --- /dev/null +++ b/python/pacemaker/_cts/tests/ctstest.py @@ -0,0 +1,252 @@ +""" Base classes for CTS tests """ + +__all__ = ["CTSTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re + +from pacemaker._cts.environment import EnvFactory +from pacemaker._cts.logging import LogFactory +from pacemaker._cts.patterns import PatternSelector +from pacemaker._cts.remote import RemoteFactory +from pacemaker._cts.timer import Timer +from pacemaker._cts.watcher import LogWatcher + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class CTSTest: + """ The base class for all cluster tests. This implements a basic set of + properties and behaviors like setup, tear down, time keeping, and + statistics tracking. It is up to specific tests to implement their own + specialized behavior on top of this class. + """ + + def __init__(self, cm): + """ Create a new CTSTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + # pylint: disable=invalid-name + + self.audits = [] + self.name = None + self.templates = PatternSelector(cm["Name"]) + + self.stats = { + "auditfail": 0, + "calls": 0, + "failure": 0, + "skipped": 0, + "success": 0 + } + + self._cm = cm + self._env = EnvFactory().getInstance() + self._r_o2cb = None + self._r_ocfs2 = [] + self._rsh = RemoteFactory().getInstance() + self._logger = LogFactory() + self._timers = {} + + self.benchmark = True # which tests to benchmark + self.failed = False + self.is_experimental = False + self.is_loop = False + self.is_unsafe = False + self.is_valgrind = False + self.passed = True + + def log(self, args): + """ Log a message """ + + self._logger.log(args) + + def debug(self, args): + """ Log a debug message """ + + self._logger.debug(args) + + def get_timer(self, key="test"): + """ Get the start time of the given timer """ + + try: + return self._timers[key].start_time + except KeyError: + return 0 + + def set_timer(self, key="test"): + """ Set the start time of the given timer to now, and return + that time + """ + + if key not in self._timers: + self._timers[key] = Timer(self._logger, self.name, key) + + self._timers[key].start() + return self._timers[key].start_time + + def log_timer(self, key="test"): + """ Log the elapsed time of the given timer """ + + if key not in self._timers: + return + + elapsed = self._timers[key].elapsed + self.debug("%s:%s runtime: %.2f" % (self.name, key, elapsed)) + del self._timers[key] + + def incr(self, name): + """ Increment the given stats key """ + + if name not in self.stats: + self.stats[name] = 0 + + self.stats[name] += 1 + + # Reset the test passed boolean + if name == "calls": + self.passed = True + + def failure(self, reason="none"): + """ Increment the failure count, with an optional failure reason """ + + self.passed = False + self.incr("failure") + self._logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason) + + return False + + def success(self): + """ Increment the success count """ + + self.incr("success") + return True + + def skipped(self): + """ Increment the skipped count """ + + self.incr("skipped") + return True + + def __call__(self, node): + """ Perform this test """ + + raise NotImplementedError + + def audit(self): + """ Perform all the relevant audits (see ClusterAudit), returning + whether or not they all passed. + """ + + passed = True + + for audit in self.audits: + if not audit(): + self._logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name)) + self.incr("auditfail") + passed = False + + return passed + + def setup(self, node): + """ Setup this test """ + + # node is used in subclasses + # pylint: disable=unused-argument + + return self.success() + + def teardown(self, node): + """ Tear down this test """ + + # node is used in subclasses + # pylint: disable=unused-argument + + return self.success() + + def create_watch(self, patterns, timeout, name=None): + """ Create a new LogWatcher object with the given patterns, timeout, + and optional name. This object can be used to search log files + for matching patterns during this test's run. + """ + if not name: + name = self.name + + return LogWatcher(self._env["LogFileName"], patterns, self._env["nodes"], self._env["LogWatcher"], name, timeout) + + def local_badnews(self, prefix, watch, local_ignore=None): + """ Use the given watch object to search through log files for messages + starting with the given prefix. If no prefix is given, use + "LocalBadNews:" by default. The optional local_ignore list should + be a list of regexes that, if found in a line, will cause that line + to be ignored. + + Return the number of matches found. + """ + errcount = 0 + if not prefix: + prefix = "LocalBadNews:" + + ignorelist = [" CTS: ", prefix] + + if local_ignore: + ignorelist += local_ignore + + while errcount < 100: + match = watch.look(0) + if match: + add_err = True + + for ignore in ignorelist: + if add_err and re.search(ignore, match): + add_err = False + + if add_err: + self._logger.log("%s %s" % (prefix, match)) + errcount += 1 + else: + break + else: + self._logger.log("Too many errors!") + + watch.end() + return errcount + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. + This method must be implemented by all subclasses. + """ + + if self.is_loop and not self._env["loop-tests"]: + return False + + if self.is_unsafe and not self._env["unsafe-tests"]: + return False + + if self.is_valgrind and not self._env["valgrind-tests"]: + return False + + if self.is_experimental and not self._env["experimental-tests"]: + return False + + if self._env["benchmark"] and not self.benchmark: + return False + + return True + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [] diff --git a/python/pacemaker/_cts/tests/fliptest.py b/python/pacemaker/_cts/tests/fliptest.py new file mode 100644 index 0000000..5e77936 --- /dev/null +++ b/python/pacemaker/_cts/tests/fliptest.py @@ -0,0 +1,61 @@ +""" Stop running nodes, and start stopped nodes """ + +__all__ = ["FlipTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import time + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.tests.stoptest import StopTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class FlipTest(CTSTest): + """ A concrete test that stops running nodes and starts stopped nodes """ + + def __init__(self, cm): + """ Create a new FlipTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.name = "Flip" + + self._start = StartTest(cm) + self._stop = StopTest(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + if self._cm.expected_status[node] == "up": + self.incr("stopped") + ret = self._stop(node) + kind = "up->down" + # Give the cluster time to recognize it's gone... + time.sleep(self._env["StableTime"]) + elif self._cm.expected_status[node] == "down": + self.incr("started") + ret = self._start(node) + kind = "down->up" + else: + return self.skipped() + + self.incr(kind) + if ret: + return self.success() + + return self.failure("%s failure" % kind) diff --git a/python/pacemaker/_cts/tests/maintenancemode.py b/python/pacemaker/_cts/tests/maintenancemode.py new file mode 100644 index 0000000..3c57c07 --- /dev/null +++ b/python/pacemaker/_cts/tests/maintenancemode.py @@ -0,0 +1,238 @@ +""" Toggle nodes in and out of maintenance mode """ + +__all__ = ["MaintenanceMode"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.timer import Timer + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class MaintenanceMode(CTSTest): + """ A concrete test that toggles nodes in and out of maintenance mode """ + + def __init__(self, cm): + """ Create a new MaintenanceMode instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.benchmark = True + self.name = "MaintenanceMode" + + self._action = "asyncmon" + self._rid = "maintenanceDummy" + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + + def _toggle_maintenance_mode(self, node, enabled): + """ Toggle maintenance mode on the given node """ + + pats = [ + self.templates["Pat:DC_IDLE"] + ] + + if enabled: + action = "On" + else: + action = "Off" + + # fail the resource right after turning Maintenance mode on + # verify it is not recovered until maintenance mode is turned off + if enabled: + pats.append(self.templates["Pat:RscOpFail"] % (self._action, self._rid)) + else: + pats.extend([ + self.templates["Pat:RscOpOK"] % ("stop", self._rid), + self.templates["Pat:RscOpOK"] % ("start", self._rid) + ]) + + watch = self.create_watch(pats, 60) + watch.set_watch() + + self.debug("Turning maintenance mode %s" % action) + self._rsh(node, self.templates["MaintenanceMode%s" % action]) + + if enabled: + self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self._rid, node)) + + with Timer(self._logger, self.name, "recover%s" % action): + watch.look_for_all() + + if watch.unmatched: + self.debug("Failed to find patterns when turning maintenance mode %s" % action) + return repr(watch.unmatched) + + return "" + + def _insert_maintenance_dummy(self, node): + """ Create a dummy resource on the given node """ + + pats = [ + ("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self._rid)) + ] + + watch = self.create_watch(pats, 60) + watch.set_watch() + + self._cm.add_dummy_rsc(node, self._rid) + + with Timer(self._logger, self.name, "addDummy"): + watch.look_for_all() + + if watch.unmatched: + self.debug("Failed to find patterns when adding maintenance dummy resource") + return repr(watch.unmatched) + + return "" + + def _remove_maintenance_dummy(self, node): + """ Remove the previously created dummy resource on the given node """ + + pats = [ + self.templates["Pat:RscOpOK"] % ("stop", self._rid) + ] + + watch = self.create_watch(pats, 60) + watch.set_watch() + self._cm.remove_dummy_rsc(node, self._rid) + + with Timer(self._logger, self.name, "removeDummy"): + watch.look_for_all() + + if watch.unmatched: + self.debug("Failed to find patterns when removing maintenance dummy resource") + return repr(watch.unmatched) + + return "" + + def _managed_rscs(self, node): + """ Return a list of all resources managed by the cluster """ + + rscs = [] + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + + for line in lines: + if re.search("^Resource", line): + tmp = AuditResource(self._cm, line) + + if tmp.managed: + rscs.append(tmp.id) + + return rscs + + def _verify_resources(self, node, rscs, managed): + """ Verify that all resources in rscList are managed if they are expected + to be, or unmanaged if they are expected to be. + """ + + managed_rscs = rscs + managed_str = "managed" + + if not managed: + managed_str = "unmanaged" + + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + for line in lines: + if re.search("^Resource", line): + tmp = AuditResource(self._cm, line) + + if managed and not tmp.managed: + continue + + if not managed and tmp.managed: + continue + + if managed_rscs.count(tmp.id): + managed_rscs.remove(tmp.id) + + if not managed_rscs: + self.debug("Found all %s resources on %s" % (managed_str, node)) + return True + + self._logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managed_rscs)) + return False + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + verify_managed = False + verify_unmanaged = False + fail_pat = "" + + if not self._startall(None): + return self.failure("Setup failed") + + # get a list of all the managed resources. We use this list + # after enabling maintenance mode to verify all managed resources + # become un-managed. After maintenance mode is turned off, we use + # this list to verify all the resources become managed again. + managed_rscs = self._managed_rscs(node) + if not managed_rscs: + self._logger.log("No managed resources on %s" % node) + return self.skipped() + + # insert a fake resource we can fail during maintenance mode + # so we can verify recovery does not take place until after maintenance + # mode is disabled. + fail_pat += self._insert_maintenance_dummy(node) + + # toggle maintenance mode ON, then fail dummy resource. + fail_pat += self._toggle_maintenance_mode(node, True) + + # verify all the resources are now unmanaged + if self._verify_resources(node, managed_rscs, False): + verify_unmanaged = True + + # Toggle maintenance mode OFF, verify dummy is recovered. + fail_pat += self._toggle_maintenance_mode(node, False) + + # verify all the resources are now managed again + if self._verify_resources(node, managed_rscs, True): + verify_managed = True + + # Remove our maintenance dummy resource. + fail_pat += self._remove_maintenance_dummy(node) + + self._cm.cluster_stable() + + if fail_pat != "": + return self.failure("Unmatched patterns: %s" % fail_pat) + + if not verify_unmanaged: + return self.failure("Failed to verify resources became unmanaged during maintenance mode") + + if not verify_managed: + return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + r"Updating failcount for %s" % self._rid, + r"schedulerd.*: Recover\s+%s\s+\(.*\)" % self._rid, + r"Unknown operation: fail", + self.templates["Pat:RscOpOK"] % (self._action, self._rid), + r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self._rid, self._action, 0) + ] diff --git a/python/pacemaker/_cts/tests/nearquorumpointtest.py b/python/pacemaker/_cts/tests/nearquorumpointtest.py new file mode 100644 index 0000000..c5b70b7 --- /dev/null +++ b/python/pacemaker/_cts/tests/nearquorumpointtest.py @@ -0,0 +1,125 @@ +""" Randomly start and stop nodes to bring the cluster close to the quorum point """ + +__all__ = ["NearQuorumPointTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class NearQuorumPointTest(CTSTest): + """ A concrete test that randomly starts and stops nodes to bring the + cluster close to the quorum point + """ + + def __init__(self, cm): + """ Create a new NearQuorumPointTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "NearQuorumPoint" + + def __call__(self, dummy): + """ Perform this test """ + + self.incr("calls") + startset = [] + stopset = [] + + stonith = self._cm.prepare_fencing_watcher() + #decide what to do with each node + for node in self._env["nodes"]: + action = self._env.random_gen.choice(["start", "stop"]) + + if action == "start": + startset.append(node) + elif action == "stop": + stopset.append(node) + + self.debug("start nodes:%r" % startset) + self.debug("stop nodes:%r" % stopset) + + #add search patterns + watchpats = [] + for node in stopset: + if self._cm.expected_status[node] == "up": + watchpats.append(self.templates["Pat:We_stopped"] % node) + + for node in startset: + if self._cm.expected_status[node] == "down": + watchpats.append(self.templates["Pat:Local_started"] % node) + else: + for stopping in stopset: + if self._cm.expected_status[stopping] == "up": + watchpats.append(self.templates["Pat:They_stopped"] % (node, stopping)) + + if not watchpats: + return self.skipped() + + if startset: + watchpats.append(self.templates["Pat:DC_IDLE"]) + + watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + + watch.set_watch() + + #begin actions + for node in stopset: + if self._cm.expected_status[node] == "up": + self._cm.stop_cm_async(node) + + for node in startset: + if self._cm.expected_status[node] == "down": + self._cm.start_cm_async(node) + + #get the result + if watch.look_for_all(): + self._cm.cluster_stable() + self._cm.fencing_cleanup("NearQuorumPoint", stonith) + return self.success() + + self._logger.log("Warn: Patterns not found: %r" % watch.unmatched) + + #get the "bad" nodes + upnodes = [] + for node in stopset: + if self._cm.stat_cm(node): + upnodes.append(node) + + downnodes = [] + for node in startset: + if not self._cm.stat_cm(node): + downnodes.append(node) + + self._cm.fencing_cleanup("NearQuorumPoint", stonith) + if not upnodes and not downnodes: + self._cm.cluster_stable() + + # Make sure they're completely down with no residule + for node in stopset: + self._rsh(node, self.templates["StopCmd"]) + + return self.success() + + if upnodes: + self._logger.log("Warn: Unstoppable nodes: %r" % upnodes) + + if downnodes: + self._logger.log("Warn: Unstartable nodes: %r" % downnodes) + + return self.failure() diff --git a/python/pacemaker/_cts/tests/partialstart.py b/python/pacemaker/_cts/tests/partialstart.py new file mode 100644 index 0000000..1b074e6 --- /dev/null +++ b/python/pacemaker/_cts/tests/partialstart.py @@ -0,0 +1,75 @@ +""" Start a node and then tell it to stop before it is fully running """ + +__all__ = ["PartialStart"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite import SimulStopLite +from pacemaker._cts.tests.stoptest import StopTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class PartialStart(CTSTest): + """ A concrete test that interrupts a node before it's finished starting up """ + + def __init__(self, cm): + """ Create a new PartialStart instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "PartialStart" + + self._startall = SimulStartLite(cm) + self._stop = StopTest(cm) + self._stopall = SimulStopLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + ret = self._stopall(None) + if not ret: + return self.failure("Setup failed") + + watchpats = [ + "pacemaker-controld.*Connecting to .* cluster infrastructure" + ] + watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + watch.set_watch() + + self._cm.start_cm_async(node) + ret = watch.look_for_all() + if not ret: + self._logger.log("Patterns not found: %r" % watch.unmatched) + return self.failure("Setup of %s failed" % node) + + ret = self._stop(node) + if not ret: + return self.failure("%s did not stop in time" % node) + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + # We might do some fencing in the 2-node case if we make it up far enough + return [ + r"Executing reboot fencing operation", + r"Requesting fencing \([^)]+\) targeting node " + ] diff --git a/python/pacemaker/_cts/tests/reattach.py b/python/pacemaker/_cts/tests/reattach.py new file mode 100644 index 0000000..4452bc0 --- /dev/null +++ b/python/pacemaker/_cts/tests/reattach.py @@ -0,0 +1,221 @@ +""" Restart the cluster and verify resources remain running """ + +__all__ = ["Reattach"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import re +import time + +from pacemaker.exitstatus import ExitStatus +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite import SimulStopLite +from pacemaker._cts.tests.starttest import StartTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class Reattach(CTSTest): + """ A concrete test that restarts the cluster and verifies that resources + remain running throughout + """ + + def __init__(self, cm): + """ Create a new Reattach instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "Reattach" + + self._startall = SimulStartLite(cm) + self._stopall = SimulStopLite(cm) + + def _is_managed(self, node): + """ Are resources managed by the cluster? """ + + (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1) + is_managed = is_managed[0].strip() + return is_managed == "true" + + def _set_unmanaged(self, node): + """ Disable resource management """ + + self.debug("Disable resource management") + self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") + + def _set_managed(self, node): + """ Enable resource management """ + + self.debug("Re-enable resource management") + self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") + + def _disable_incompatible_rscs(self, node): + """ Disable resources that are incompatible with this test + + Starts and stops of stonith-class resources are implemented internally + by Pacemaker, which means that they must stop when Pacemaker is + stopped, even if unmanaged. Disable them before running the Reattach + test so they don't affect resource placement. + + OCFS2 resources must be disabled too for some reason. + + Set target-role to "Stopped" for any of these resources in the CIB. + """ + + self.debug("Disable incompatible (stonith/OCFS2) resources") + xml = """'<meta_attributes id="cts-lab-Reattach-meta"> + <nvpair id="cts-lab-Reattach-target-role" name="target-role" value="Stopped"/> + <rule id="cts-lab-Reattach-rule" boolean-op="or" score="INFINITY"> + <rsc_expression id="cts-lab-Reattach-stonith" class="stonith"/> + <rsc_expression id="cts-lab-Reattach-o2cb" type="o2cb"/> + </rule> + </meta_attributes>' --scope rsc_defaults""" + return self._rsh(node, self._cm.templates['CibAddXml'] % xml) + + def _enable_incompatible_rscs(self, node): + """ Re-enable resources that were incompatible with this test """ + + self.debug("Re-enable incompatible (stonith/OCFS2) resources") + xml = """<meta_attributes id="cts-lab-Reattach-meta">""" + return self._rsh(node, """cibadmin --delete --xml-text '%s'""" % xml) + + def _reprobe(self, node): + """ Reprobe all resources + + The placement of some resources (such as promotable-1 in the + lab-generated CIB) is affected by constraints using node-attribute-based + rules. An earlier test may have erased the relevant node attribute, so + do a reprobe, which should add the attribute back. + """ + + return self._rsh(node, """crm_resource --refresh""") + + def setup(self, node): + """ Setup this test """ + + if not self._startall(None): + return self.failure("Startall failed") + + (rc, _) = self._disable_incompatible_rscs(node) + if rc != ExitStatus.OK: + return self.failure("Couldn't modify CIB to stop incompatible resources") + + (rc, _) = self._reprobe(node) + if rc != ExitStatus.OK: + return self.failure("Couldn't reprobe resources") + + if not self._cm.cluster_stable(double_check=True): + return self.failure("Cluster did not stabilize after setup") + + return self.success() + + def teardown(self, node): + """ Tear down this test """ + + # Make sure 'node' is up + start = StartTest(self._cm) + start(node) + + if not self._is_managed(node): + self._set_managed(node) + + (rc, _) = self._enable_incompatible_rscs(node) + if rc != ExitStatus.OK: + return self.failure("Couldn't modify CIB to re-enable incompatible resources") + + if not self._cm.cluster_stable(): + return self.failure("Cluster did not stabilize after teardown") + if not self._is_managed(node): + return self.failure("Could not re-enable resource management") + + return self.success() + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + # Conveniently, the scheduler will display this message when disabling + # management, even if fencing is not enabled, so we can rely on it. + managed = self.create_watch(["No fencing will be done"], 60) + managed.set_watch() + + self._set_unmanaged(node) + + if not managed.look_for_all(): + self._logger.log("Patterns not found: %r" % managed.unmatched) + return self.failure("Resource management not disabled") + + pats = [ + self.templates["Pat:RscOpOK"] % ("start", ".*"), + self.templates["Pat:RscOpOK"] % ("stop", ".*"), + self.templates["Pat:RscOpOK"] % ("promote", ".*"), + self.templates["Pat:RscOpOK"] % ("demote", ".*"), + self.templates["Pat:RscOpOK"] % ("migrate", ".*") + ] + + watch = self.create_watch(pats, 60, "ShutdownActivity") + watch.set_watch() + + self.debug("Shutting down the cluster") + ret = self._stopall(None) + if not ret: + self._set_managed(node) + return self.failure("Couldn't shut down the cluster") + + self.debug("Bringing the cluster back up") + ret = self._startall(None) + time.sleep(5) # allow ping to update the CIB + if not ret: + self._set_managed(node) + return self.failure("Couldn't restart the cluster") + + if self.local_badnews("ResourceActivity:", watch): + self._set_managed(node) + return self.failure("Resources stopped or started during cluster restart") + + watch = self.create_watch(pats, 60, "StartupActivity") + watch.set_watch() + + # Re-enable resource management (and verify it happened). + self._set_managed(node) + self._cm.cluster_stable() + if not self._is_managed(node): + return self.failure("Could not re-enable resource management") + + # Ignore actions for STONITH resources + ignore = [] + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + for line in lines: + if re.search("^Resource", line): + r = AuditResource(self._cm, line) + + if r.rclass == "stonith": + self.debug("Ignoring start actions for %s" % r.id) + ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) + + if self.local_badnews("ResourceActivity:", watch, ignore): + return self.failure("Resources stopped or started after resource management was re-enabled") + + return ret + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + r"resource( was|s were) active at shutdown" + ] diff --git a/python/pacemaker/_cts/tests/remotebasic.py b/python/pacemaker/_cts/tests/remotebasic.py new file mode 100644 index 0000000..2f25aaf --- /dev/null +++ b/python/pacemaker/_cts/tests/remotebasic.py @@ -0,0 +1,39 @@ +""" Start and stop a remote node """ + +__all__ = ["RemoteBasic"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.remotedriver import RemoteDriver + + +class RemoteBasic(RemoteDriver): + """ A concrete test that starts and stops a remote node """ + + def __init__(self, cm): + """ Create a new RemoteBasic instance + + Arguments: + + cm -- A ClusterManager instance + """ + + RemoteDriver.__init__(self, cm) + + self.name = "RemoteBasic" + + def __call__(self, node): + """ Perform this test """ + + if not self.start_new_test(node): + return self.failure(self.fail_string) + + self.test_attributes(node) + self.cleanup_metal(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + if self.failed: + return self.failure(self.fail_string) + + return self.success() diff --git a/python/pacemaker/_cts/tests/remotedriver.py b/python/pacemaker/_cts/tests/remotedriver.py new file mode 100644 index 0000000..c5b0292 --- /dev/null +++ b/python/pacemaker/_cts/tests/remotedriver.py @@ -0,0 +1,556 @@ +""" Base classes for CTS tests """ + +__all__ = ["RemoteDriver"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import os +import time +import subprocess +import tempfile + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.tests.stoptest import StopTest +from pacemaker._cts.timer import Timer + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class RemoteDriver(CTSTest): + """ A specialized base class for cluster tests that run on Pacemaker + Remote nodes. This builds on top of CTSTest to provide methods + for starting and stopping services and resources, and managing + remote nodes. This is still just an abstract class -- specific + tests need to implement their own specialized behavior. + """ + + def __init__(self, cm): + """ Create a new RemoteDriver instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.name = "RemoteDriver" + + self._corosync_enabled = False + self._pacemaker_enabled = False + self._remote_node = None + self._remote_rsc = "remote-rsc" + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + self._stop = StopTest(cm) + + self.reset() + + def reset(self): + """ Reset the state of this test back to what it was before the test + was run + """ + + self.failed = False + self.fail_string = "" + + self._pcmk_started = False + self._remote_node_added = False + self._remote_rsc_added = False + self._remote_use_reconnect_interval = self._env.random_gen.choice([True, False]) + + def fail(self, msg): + """ Mark test as failed """ + + self.failed = True + + # Always log the failure. + self._logger.log(msg) + + # Use first failure as test status, as it's likely to be most useful. + if not self.fail_string: + self.fail_string = msg + + def _get_other_node(self, node): + """ Get the first cluster node out of the environment that is not the + given node. Typically, this is used to find some node that will + still be active that we can run cluster commands on. + """ + + for othernode in self._env["nodes"]: + if othernode == node: + # we don't want to try and use the cib that we just shutdown. + # find a cluster node that is not our soon to be remote-node. + continue + + return othernode + + def _del_rsc(self, node, rsc): + """ Delete the given named resource from the cluster. The given `node` + is the cluster node on which we should *not* run the delete command. + """ + + othernode = self._get_other_node(node) + (rc, _) = self._rsh(othernode, "crm_resource -D -r %s -t primitive" % rsc) + if rc != 0: + self.fail("Removal of resource '%s' failed" % rsc) + + def _add_rsc(self, node, rsc_xml): + """ Add a resource given in XML format to the cluster. The given `node` + is the cluster node on which we should *not* run the add command. + """ + + othernode = self._get_other_node(node) + (rc, _) = self._rsh(othernode, "cibadmin -C -o resources -X '%s'" % rsc_xml) + if rc != 0: + self.fail("resource creation failed") + + def _add_primitive_rsc(self, node): + """ Add a primitive heartbeat resource for the remote node to the + cluster. The given `node` is the cluster node on which we should + *not* run the add command. + """ + + rsc_xml = """ +<primitive class="ocf" id="%(node)s" provider="heartbeat" type="Dummy"> + <meta_attributes id="%(node)s-meta_attributes"/> + <operations> + <op id="%(node)s-monitor-interval-20s" interval="20s" name="monitor"/> + </operations> +</primitive>""" % { + "node": self._remote_rsc +} + + self._add_rsc(node, rsc_xml) + if not self.failed: + self._remote_rsc_added = True + + def _add_connection_rsc(self, node): + """ Add a primitive connection resource for the remote node to the + cluster. The given `node` is teh cluster node on which we should + *not* run the add command. + """ + + rsc_xml = """ +<primitive class="ocf" id="%(node)s" provider="pacemaker" type="remote"> + <instance_attributes id="%(node)s-instance_attributes"> + <nvpair id="%(node)s-instance_attributes-server" name="server" value="%(server)s"/> +""" % { + "node": self._remote_node, "server": node +} + + if self._remote_use_reconnect_interval: + # Set reconnect interval on resource + rsc_xml += """ + <nvpair id="%s-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/> +""" % self._remote_node + + rsc_xml += """ + </instance_attributes> + <operations> + <op id="%(node)s-start" name="start" interval="0" timeout="120s"/> + <op id="%(node)s-monitor-20s" name="monitor" interval="20s" timeout="45s"/> + </operations> +</primitive> +""" % { + "node": self._remote_node +} + + self._add_rsc(node, rsc_xml) + if not self.failed: + self._remote_node_added = True + + def _disable_services(self, node): + """ Disable the corosync and pacemaker services on the given node """ + + self._corosync_enabled = self._env.service_is_enabled(node, "corosync") + if self._corosync_enabled: + self._env.disable_service(node, "corosync") + + self._pacemaker_enabled = self._env.service_is_enabled(node, "pacemaker") + if self._pacemaker_enabled: + self._env.disable_service(node, "pacemaker") + + def _enable_services(self, node): + """ Enable the corosync and pacemaker services on the given node """ + + if self._corosync_enabled: + self._env.enable_service(node, "corosync") + + if self._pacemaker_enabled: + self._env.enable_service(node, "pacemaker") + + def _stop_pcmk_remote(self, node): + """ Stop the Pacemaker Remote service on the given node """ + + for _ in range(10): + (rc, _) = self._rsh(node, "service pacemaker_remote stop") + if rc != 0: + time.sleep(6) + else: + break + + def _start_pcmk_remote(self, node): + """ Start the Pacemaker Remote service on the given node """ + + for _ in range(10): + (rc, _) = self._rsh(node, "service pacemaker_remote start") + if rc != 0: + time.sleep(6) + else: + self._pcmk_started = True + break + + def _freeze_pcmk_remote(self, node): + """ Simulate a Pacemaker Remote daemon failure """ + + self._rsh(node, "killall -STOP pacemaker-remoted") + + def _resume_pcmk_remote(self, node): + """ Simulate the Pacemaker Remote daemon recovering """ + + self._rsh(node, "killall -CONT pacemaker-remoted") + + def _start_metal(self, node): + """ Setup a Pacemaker Remote configuration. Remove any existing + connection resources or nodes. Start the pacemaker_remote service. + Create a connection resource. + """ + + # Cluster nodes are reused as remote nodes in remote tests. If cluster + # services were enabled at boot, in case the remote node got fenced, the + # cluster node would join instead of the expected remote one. Meanwhile + # pacemaker_remote would not be able to start. Depending on the chances, + # the situations might not be able to be orchestrated gracefully any more. + # + # Temporarily disable any enabled cluster serivces. + self._disable_services(node) + + # make sure the resource doesn't already exist for some reason + self._rsh(node, "crm_resource -D -r %s -t primitive" % self._remote_rsc) + self._rsh(node, "crm_resource -D -r %s -t primitive" % self._remote_node) + + if not self._stop(node): + self.fail("Failed to shutdown cluster node %s" % node) + return + + self._start_pcmk_remote(node) + + if not self._pcmk_started: + self.fail("Failed to start pacemaker_remote on node %s" % node) + return + + # Convert node to baremetal now that it has shutdown the cluster stack + pats = [] + watch = self.create_watch(pats, 120) + watch.set_watch() + + pats.extend([ + self.templates["Pat:RscOpOK"] % ("start", self._remote_node), + self.templates["Pat:DC_IDLE"] + ]) + + self._add_connection_rsc(node) + + with Timer(self._logger, self.name, "remoteMetalInit"): + watch.look_for_all() + + if watch.unmatched: + self.fail("Unmatched patterns: %s" % watch.unmatched) + + def migrate_connection(self, node): + """ Move the remote connection resource from the node it's currently + running on to any other available node + """ + + if self.failed: + return + + pats = [ + self.templates["Pat:RscOpOK"] % ("migrate_to", self._remote_node), + self.templates["Pat:RscOpOK"] % ("migrate_from", self._remote_node), + self.templates["Pat:DC_IDLE"] + ] + + watch = self.create_watch(pats, 120) + watch.set_watch() + + (rc, _) = self._rsh(node, "crm_resource -M -r %s" % self._remote_node, verbose=1) + if rc != 0: + self.fail("failed to move remote node connection resource") + return + + with Timer(self._logger, self.name, "remoteMetalMigrate"): + watch.look_for_all() + + if watch.unmatched: + self.fail("Unmatched patterns: %s" % watch.unmatched) + + def fail_rsc(self, node): + """ Cause the dummy resource running on a Pacemaker Remote node to fail + and verify that the failure is logged correctly + """ + + if self.failed: + return + + watchpats = [ + self.templates["Pat:RscRemoteOpOK"] % ("stop", self._remote_rsc, self._remote_node), + self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), + self.templates["Pat:DC_IDLE"] + ] + + watch = self.create_watch(watchpats, 120) + watch.set_watch() + + self.debug("causing dummy rsc to fail.") + + self._rsh(node, "rm -f /var/run/resource-agents/Dummy*") + + with Timer(self._logger, self.name, "remoteRscFail"): + watch.look_for_all() + + if watch.unmatched: + self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched) + + def fail_connection(self, node): + """ Cause the remote connection resource to fail and verify that the + node is fenced and the connection resource is restarted on another + node. + """ + + if self.failed: + return + + watchpats = [ + self.templates["Pat:Fencing_ok"] % self._remote_node, + self.templates["Pat:NodeFenced"] % self._remote_node + ] + + watch = self.create_watch(watchpats, 120) + watch.set_watch() + + # freeze the pcmk remote daemon. this will result in fencing + self.debug("Force stopped active remote node") + self._freeze_pcmk_remote(node) + + self.debug("Waiting for remote node to be fenced.") + + with Timer(self._logger, self.name, "remoteMetalFence"): + watch.look_for_all() + + if watch.unmatched: + self.fail("Unmatched patterns: %s" % watch.unmatched) + return + + self.debug("Waiting for the remote node to come back up") + self._cm.ns.wait_for_node(node, 120) + + pats = [] + + watch = self.create_watch(pats, 240) + watch.set_watch() + + pats.append(self.templates["Pat:RscOpOK"] % ("start", self._remote_node)) + + if self._remote_rsc_added: + pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node)) + + # start the remote node again watch it integrate back into cluster. + self._start_pcmk_remote(node) + if not self._pcmk_started: + self.fail("Failed to start pacemaker_remote on node %s" % node) + return + + self.debug("Waiting for remote node to rejoin cluster after being fenced.") + + with Timer(self._logger, self.name, "remoteMetalRestart"): + watch.look_for_all() + + if watch.unmatched: + self.fail("Unmatched patterns: %s" % watch.unmatched) + + def _add_dummy_rsc(self, node): + """ Add a dummy resource that runs on the Pacemaker Remote node """ + + if self.failed: + return + + # verify we can put a resource on the remote node + pats = [] + watch = self.create_watch(pats, 120) + watch.set_watch() + + pats.extend([ + self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), + self.templates["Pat:DC_IDLE"] + ]) + + # Add a resource that must live on remote-node + self._add_primitive_rsc(node) + + # force that rsc to prefer the remote node. + (rc, _) = self._cm.rsh(node, "crm_resource -M -r %s -N %s -f" % (self._remote_rsc, self._remote_node), verbose=1) + if rc != 0: + self.fail("Failed to place remote resource on remote node.") + return + + with Timer(self._logger, self.name, "remoteMetalRsc"): + watch.look_for_all() + + if watch.unmatched: + self.fail("Unmatched patterns: %s" % watch.unmatched) + + def test_attributes(self, node): + """ Verify that attributes can be set on the Pacemaker Remote node """ + + if self.failed: + return + + # This verifies permanent attributes can be set on a remote-node. It also + # verifies the remote-node can edit its own cib node section remotely. + (rc, line) = self._cm.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % self._remote_node, verbose=1) + if rc != 0: + self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line)) + return + + (rc, _) = self._cm.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % self._remote_node, verbose=1) + if rc != 0: + self.fail("Failed to get remote-node attribute") + return + + (rc, _) = self._cm.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % self._remote_node, verbose=1) + if rc != 0: + self.fail("Failed to delete remote-node attribute") + + def cleanup_metal(self, node): + """ Clean up the Pacemaker Remote node configuration previously created by + _setup_metal. Stop and remove dummy resources and connection resources. + Stop the pacemaker_remote service. Remove the remote node itself. + """ + + self._enable_services(node) + + if not self._pcmk_started: + return + + pats = [] + + watch = self.create_watch(pats, 120) + watch.set_watch() + + if self._remote_rsc_added: + pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_rsc)) + + if self._remote_node_added: + pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_node)) + + with Timer(self._logger, self.name, "remoteMetalCleanup"): + self._resume_pcmk_remote(node) + + if self._remote_rsc_added: + # Remove dummy resource added for remote node tests + self.debug("Cleaning up dummy rsc put on remote node") + self._rsh(self._get_other_node(node), "crm_resource -U -r %s" % self._remote_rsc) + self._del_rsc(node, self._remote_rsc) + + if self._remote_node_added: + # Remove remote node's connection resource + self.debug("Cleaning up remote node connection resource") + self._rsh(self._get_other_node(node), "crm_resource -U -r %s" % self._remote_node) + self._del_rsc(node, self._remote_node) + + watch.look_for_all() + + if watch.unmatched: + self.fail("Unmatched patterns: %s" % watch.unmatched) + + self._stop_pcmk_remote(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + + if self._remote_node_added: + # Remove remote node itself + self.debug("Cleaning up node entry for remote node") + self._rsh(self._get_other_node(node), "crm_node --force --remove %s" % self._remote_node) + + def _setup_env(self, node): + """ Setup the environment to allow Pacemaker Remote to function. This + involves generating a key and copying it to all nodes in the cluster. + """ + + self._remote_node = "remote-%s" % node + + # we are assuming if all nodes have a key, that it is + # the right key... If any node doesn't have a remote + # key, we regenerate it everywhere. + if self._rsh.exists_on_all("/etc/pacemaker/authkey", self._env["nodes"]): + return + + # create key locally + (handle, keyfile) = tempfile.mkstemp(".cts") + os.close(handle) + subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + # sync key throughout the cluster + for n in self._env["nodes"]: + self._rsh(n, "mkdir -p --mode=0750 /etc/pacemaker") + self._rsh.copy(keyfile, "root@%s:/etc/pacemaker/authkey" % n) + self._rsh(n, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey") + self._rsh(n, "chmod 0640 /etc/pacemaker/authkey") + + os.unlink(keyfile) + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. """ + + if not CTSTest.is_applicable(self): + return False + + for node in self._env["nodes"]: + (rc, _) = self._rsh(node, "which pacemaker-remoted >/dev/null 2>&1") + if rc != 0: + return False + + return True + + def start_new_test(self, node): + """ Prepare a remote test for running by setting up its environment + and resources + """ + + self.incr("calls") + self.reset() + + ret = self._startall(None) + if not ret: + return self.failure("setup failed: could not start all nodes") + + self._setup_env(node) + self._start_metal(node) + self._add_dummy_rsc(node) + return True + + def __call__(self, node): + """ Perform this test """ + + raise NotImplementedError + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + r"""is running on remote.*which isn't allowed""", + r"""Connection terminated""", + r"""Could not send remote""" + ] diff --git a/python/pacemaker/_cts/tests/remotemigrate.py b/python/pacemaker/_cts/tests/remotemigrate.py new file mode 100644 index 0000000..e22e98f --- /dev/null +++ b/python/pacemaker/_cts/tests/remotemigrate.py @@ -0,0 +1,63 @@ +""" Move a connection resource from one node to another """ + +__all__ = ["RemoteMigrate"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.remotedriver import RemoteDriver + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class RemoteMigrate(RemoteDriver): + """ A concrete test that moves a connection resource from one node to another """ + + def __init__(self, cm): + """ Create a new RemoteMigrate instance + + Arguments: + + cm -- A ClusterManager instance + """ + + RemoteDriver.__init__(self, cm) + + self.name = "RemoteMigrate" + + def __call__(self, node): + """ Perform this test """ + + # This code is very similar to __call__ in remotestonithd.py, but I don't think + # it's worth turning into a library function nor making one a subclass of the + # other. I think that's more confusing than leaving the duplication. + # pylint: disable=duplicate-code + + if not self.start_new_test(node): + return self.failure(self.fail_string) + + self.migrate_connection(node) + self.cleanup_metal(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + if self.failed: + return self.failure(self.fail_string) + + return self.success() + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. """ + + if not RemoteDriver.is_applicable(self): + return False + + # This test requires at least three nodes: one to convert to a + # remote node, one to host the connection originally, and one + # to migrate the connection to. + return len(self._env["nodes"]) >= 3 diff --git a/python/pacemaker/_cts/tests/remoterscfailure.py b/python/pacemaker/_cts/tests/remoterscfailure.py new file mode 100644 index 0000000..6f221de --- /dev/null +++ b/python/pacemaker/_cts/tests/remoterscfailure.py @@ -0,0 +1,73 @@ +""" Cause the Pacemaker Remote connection resource to fail """ + +__all__ = ["RemoteRscFailure"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.remotedriver import RemoteDriver + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class RemoteRscFailure(RemoteDriver): + """ A concrete test that causes the Pacemaker Remote connection resource + to fail + """ + + def __init__(self, cm): + """ Create a new RemoteRscFailure instance + + Arguments: + + cm -- A ClusterManager instance + """ + + RemoteDriver.__init__(self, cm) + self.name = "RemoteRscFailure" + + def __call__(self, node): + """ Perform this test """ + + if not self.start_new_test(node): + return self.failure(self.fail_string) + + # This is an important step. We are migrating the connection + # before failing the resource. This verifies that the migration + # has properly maintained control over the remote-node. + self.migrate_connection(node) + + self.fail_rsc(node) + self.cleanup_metal(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + if self.failed: + return self.failure(self.fail_string) + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + r"schedulerd.*: Recover\s+remote-rsc\s+\(.*\)", + r"Dummy.*: No process state file found" + ] + super().errors_to_ignore + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. """ + + if not RemoteDriver.is_applicable(self): + return False + + # This test requires at least three nodes: one to convert to a + # remote node, one to host the connection originally, and one + # to migrate the connection to. + return len(self._env["nodes"]) >= 3 diff --git a/python/pacemaker/_cts/tests/remotestonithd.py b/python/pacemaker/_cts/tests/remotestonithd.py new file mode 100644 index 0000000..f684992 --- /dev/null +++ b/python/pacemaker/_cts/tests/remotestonithd.py @@ -0,0 +1,62 @@ +""" Fail the connection resource and fence the remote node """ + +__all__ = ["RemoteStonithd"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.remotedriver import RemoteDriver + + +class RemoteStonithd(RemoteDriver): + """ A concrete test that fails the connection resource and fences the + remote node + """ + + def __init__(self, cm): + """ Create a new RemoteStonithd instance + + Arguments: + + cm -- A ClusterManager instance + """ + + RemoteDriver.__init__(self, cm) + + self.name = "RemoteStonithd" + + def __call__(self, node): + """ Perform this test """ + + if not self.start_new_test(node): + return self.failure(self.fail_string) + + self.fail_connection(node) + self.cleanup_metal(node) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + if self.failed: + return self.failure(self.fail_string) + + return self.success() + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. """ + + if not RemoteDriver.is_applicable(self): + return False + + return self._env.get("DoFencing", True) + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + r"Lost connection to Pacemaker Remote node", + r"Software caused connection abort", + r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", + r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", + r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)", + r"error: Result of monitor operation for .* on remote-.*: Internal communication failure" + ] + super().errors_to_ignore diff --git a/python/pacemaker/_cts/tests/resourcerecover.py b/python/pacemaker/_cts/tests/resourcerecover.py new file mode 100644 index 0000000..252eb1f --- /dev/null +++ b/python/pacemaker/_cts/tests/resourcerecover.py @@ -0,0 +1,175 @@ +""" Fail a random resource and verify its fail count increases """ + +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.audits import AuditResource +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.timer import Timer + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class ResourceRecover(CTSTest): + """ A concrete test that fails a random resource """ + + def __init__(self, cm): + """ Create a new ResourceRecover instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.benchmark = True + self.name = "ResourceRecover" + + self._action = "asyncmon" + self._interval = 0 + self._rid = None + self._rid_alt = None + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + if not self._startall(None): + return self.failure("Setup failed") + + # List all resources active on the node (skip test if none) + resourcelist = self._cm.active_resources(node) + if not resourcelist: + self._logger.log("No active resources on %s" % node) + return self.skipped() + + # Choose one resource at random + rsc = self._choose_resource(node, resourcelist) + if rsc is None: + return self.failure("Could not get details of resource '%s'" % self._rid) + + if rsc.id == rsc.clone_id: + self.debug("Failing %s" % rsc.id) + else: + self.debug("Failing %s (also known as %s)" % (rsc.id, rsc.clone_id)) + + # Log patterns to watch for (failure, plus restart if managed) + pats = [ + self.templates["Pat:CloneOpFail"] % (self._action, rsc.id, rsc.clone_id) + ] + + if rsc.managed: + pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._rid)) + + if rsc.unique: + pats.append(self.templates["Pat:RscOpOK"] % ("start", self._rid)) + else: + # Anonymous clones may get restarted with a different clone number + pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) + + # Fail resource. (Ideally, we'd fail it twice, to ensure the fail count + # is incrementing properly, but it might restart on a different node. + # We'd have to temporarily ban it from all other nodes and ensure the + # migration-threshold hasn't been reached.) + if self._fail_resource(rsc, node, pats) is None: + # self.failure() already called + return None + + return self.success() + + def _choose_resource(self, node, resourcelist): + """ Choose a random resource to target """ + + self._rid = self._env.random_gen.choice(resourcelist) + self._rid_alt = self._rid + (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) + + for line in lines: + if line.startswith("Resource: "): + rsc = AuditResource(self._cm, line) + + if rsc.id == self._rid: + # Handle anonymous clones that get renamed + self._rid = rsc.clone_id + return rsc + + return None + + def _get_failcount(self, node): + """ Check the fail count of targeted resource on given node """ + + cmd = "crm_failcount --quiet --query --resource %s --operation %s --interval %d --node %s" + (rc, lines) = self._rsh(node, cmd % (self._rid, self._action, self._interval, node), + verbose=1) + + if rc != 0 or len(lines) != 1: + lines = [l.strip() for l in lines] + self._logger.log("crm_failcount on %s failed (%d): %s" % (node, rc, " // ".join(lines))) + return -1 + + try: + failcount = int(lines[0]) + except (IndexError, ValueError): + self._logger.log("crm_failcount output on %s unparseable: %s" % (node, " ".join(lines))) + return -1 + + return failcount + + def _fail_resource(self, rsc, node, pats): + """ Fail the targeted resource, and verify as expected """ + + orig_failcount = self._get_failcount(node) + + watch = self.create_watch(pats, 60) + watch.set_watch() + + self._rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self._rid, node)) + + with Timer(self._logger, self.name, "recover"): + watch.look_for_all() + + self._cm.cluster_stable() + recovered = self._cm.resource_location(self._rid) + + if watch.unmatched: + return self.failure("Patterns not found: %r" % watch.unmatched) + + if rsc.unique and len(recovered) > 1: + return self.failure("%s is now active on more than one node: %r" % (self._rid, recovered)) + + if recovered: + self.debug("%s is running on: %r" % (self._rid, recovered)) + + elif rsc.managed: + return self.failure("%s was not recovered and is inactive" % self._rid) + + new_failcount = self._get_failcount(node) + if new_failcount != orig_failcount + 1: + return self.failure("%s fail count is %d not %d" + % (self._rid, new_failcount, orig_failcount + 1)) + + return 0 # Anything but None is success + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + r"Updating failcount for %s" % self._rid, + r"schedulerd.*: Recover\s+(%s|%s)\s+\(.*\)" % (self._rid, self._rid_alt), + r"Unknown operation: fail", + self.templates["Pat:RscOpOK"] % (self._action, self._rid), + r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self._rid, self._action, self._interval) + ] diff --git a/python/pacemaker/_cts/tests/restartonebyone.py b/python/pacemaker/_cts/tests/restartonebyone.py new file mode 100644 index 0000000..23b3a68 --- /dev/null +++ b/python/pacemaker/_cts/tests/restartonebyone.py @@ -0,0 +1,58 @@ +""" Restart all nodes in order """ + +__all__ = ["RestartOnebyOne"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.restarttest import RestartTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class RestartOnebyOne(CTSTest): + """ A concrete test that restarts all nodes in order """ + + def __init__(self, cm): + """ Create a new RestartOnebyOne instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "RestartOnebyOne" + + self._restart = None + self._startall = SimulStartLite(cm) + + def __call__(self, dummy): + """ Perform the test """ + + self.incr("calls") + + ret = self._startall(None) + if not ret: + return self.failure("Setup failed") + + did_fail = [] + self.set_timer() + self._restart = RestartTest(self._cm) + + for node in self._env["nodes"]: + if not self._restart(node): + did_fail.append(node) + + if did_fail: + return self.failure("Could not restart %d nodes: %r" % (len(did_fail), did_fail)) + + return self.success() diff --git a/python/pacemaker/_cts/tests/restarttest.py b/python/pacemaker/_cts/tests/restarttest.py new file mode 100644 index 0000000..3b628ce --- /dev/null +++ b/python/pacemaker/_cts/tests/restarttest.py @@ -0,0 +1,49 @@ +""" Stop and restart a node """ + +__all__ = ["RestartTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.starttest import StartTest +from pacemaker._cts.tests.stoptest import StopTest + + +class RestartTest(CTSTest): + """ A concrete test that stops and restarts a node """ + + def __init__(self, cm): + """ Create a new RestartTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.benchmark = True + self.name = "Restart" + + self._start = StartTest(cm) + self._stop = StopTest(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + self.incr("node:%s" % node) + + if self._cm.stat_cm(node): + self.incr("WasStopped") + if not self._start(node): + return self.failure("start (setup) failure: %s" % node) + + self.set_timer() + + if not self._stop(node): + return self.failure("stop failure: %s" % node) + + if not self._start(node): + return self.failure("start failure: %s" % node) + + return self.success() diff --git a/python/pacemaker/_cts/tests/resynccib.py b/python/pacemaker/_cts/tests/resynccib.py new file mode 100644 index 0000000..fe634d6 --- /dev/null +++ b/python/pacemaker/_cts/tests/resynccib.py @@ -0,0 +1,75 @@ +""" Start the cluster without a CIB and verify it gets copied from another node """ + +__all__ = ["ResyncCIB"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker import BuildOptions +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.restarttest import RestartTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite import SimulStopLite + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable + + +class ResyncCIB(CTSTest): + """ A concrete test that starts the cluster on one node without a CIB and + verifies the CIB is copied over when the remaining nodes join + """ + + def __init__(self, cm): + """ Create a new ResyncCIB instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "ResyncCIB" + + self._restart1 = RestartTest(cm) + self._startall = SimulStartLite(cm) + self._stopall = SimulStopLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + + # Shut down all the nodes... + if not self._stopall(None): + return self.failure("Could not stop all nodes") + + # Test config recovery when the other nodes come up + self._rsh(node, "rm -f %s/cib*" % BuildOptions.CIB_DIR) + + # Start the selected node + if not self._restart1(node): + return self.failure("Could not start %s" % node) + + # Start all remaining nodes + if not self._startall(None): + return self.failure("Could not start the remaining nodes") + + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + # Errors that occur as a result of the CIB being wiped + return [ + r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed", + r"error.*: Resource start-up disabled since no STONITH resources have been defined", + r"error.*: Either configure some or disable STONITH with the stonith-enabled option", + r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity" + ] diff --git a/python/pacemaker/_cts/tests/simulstart.py b/python/pacemaker/_cts/tests/simulstart.py new file mode 100644 index 0000000..88a7f2f --- /dev/null +++ b/python/pacemaker/_cts/tests/simulstart.py @@ -0,0 +1,42 @@ +""" Start all stopped nodes simultaneously """ + +__all__ = ["SimulStart"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite import SimulStopLite + + +class SimulStart(CTSTest): + """ A concrete test that starts all stopped nodes simultaneously """ + + def __init__(self, cm): + """ Create a new SimulStart instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "SimulStart" + + self._startall = SimulStartLite(cm) + self._stopall = SimulStopLite(cm) + + def __call__(self, dummy): + """ Perform this test """ + + self.incr("calls") + + ret = self._stopall(None) + if not ret: + return self.failure("Setup failed") + + if not self._startall(None): + return self.failure("Startall failed") + + return self.success() diff --git a/python/pacemaker/_cts/tests/simulstartlite.py b/python/pacemaker/_cts/tests/simulstartlite.py new file mode 100644 index 0000000..c5c51e1 --- /dev/null +++ b/python/pacemaker/_cts/tests/simulstartlite.py @@ -0,0 +1,133 @@ +""" Simultaneously start stopped nodes """ + +__all__ = ["SimulStartLite"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class SimulStartLite(CTSTest): + """ A pseudo-test that is only used to set up conditions before running + some other test. This class starts any stopped nodes more or less + simultaneously. + + Other test classes should not use this one as a superclass. + """ + + def __init__(self, cm): + """ Create a new SimulStartLite instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.name = "SimulStartLite" + + def __call__(self, dummy): + """ Start all stopped nodes more or less simultaneously, returning + whether this succeeded or not. + """ + + self.incr("calls") + self.debug("Setup: %s" % self.name) + + # We ignore the "node" parameter... + node_list = [] + for node in self._env["nodes"]: + if self._cm.expected_status[node] == "down": + self.incr("WasStopped") + node_list.append(node) + + self.set_timer() + while len(node_list) > 0: + # Repeat until all nodes come up + uppat = self.templates["Pat:NonDC_started"] + if self._cm.upcount() == 0: + uppat = self.templates["Pat:Local_started"] + + watchpats = [ + self.templates["Pat:DC_IDLE"] + ] + for node in node_list: + watchpats.extend([uppat % node, + self.templates["Pat:InfraUp"] % node, + self.templates["Pat:PacemakerUp"] % node]) + + # Start all the nodes - at about the same time... + watch = self.create_watch(watchpats, self._env["DeadTime"]+10) + watch.set_watch() + + stonith = self._cm.prepare_fencing_watcher() + + for node in node_list: + self._cm.start_cm_async(node) + + watch.look_for_all() + + node_list = self._cm.fencing_cleanup(self.name, stonith) + + if node_list is None: + return self.failure("Cluster did not stabilize") + + # Remove node_list messages from watch.unmatched + for node in node_list: + self._logger.debug("Dealing with stonith operations for %s" % node_list) + if watch.unmatched: + try: + watch.unmatched.remove(uppat % node) + except ValueError: + self.debug("Already matched: %s" % (uppat % node)) + + try: + watch.unmatched.remove(self.templates["Pat:InfraUp"] % node) + except ValueError: + self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node)) + + try: + watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node) + except ValueError: + self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node)) + + if watch.unmatched: + for regex in watch.unmatched: + self._logger.log("Warn: Startup pattern not found: %s" % regex) + + if not self._cm.cluster_stable(): + return self.failure("Cluster did not stabilize") + + did_fail = False + unstable = [] + for node in self._env["nodes"]: + if not self._cm.stat_cm(node): + did_fail = True + unstable.append(node) + + if did_fail: + return self.failure("Unstarted nodes exist: %s" % unstable) + + unstable = [] + for node in self._env["nodes"]: + if not self._cm.node_stable(node): + did_fail = True + unstable.append(node) + + if did_fail: + return self.failure("Unstable cluster nodes exist: %s" % unstable) + + return self.success() + + def is_applicable(self): + """ SimulStartLite is a setup test and never applicable """ + + return False diff --git a/python/pacemaker/_cts/tests/simulstop.py b/python/pacemaker/_cts/tests/simulstop.py new file mode 100644 index 0000000..174c533 --- /dev/null +++ b/python/pacemaker/_cts/tests/simulstop.py @@ -0,0 +1,42 @@ +""" Stop all running nodes simultaneously """ + +__all__ = ["SimulStop"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.simulstoplite import SimulStopLite + + +class SimulStop(CTSTest): + """ A concrete test that stops all running nodes simultaneously """ + + def __init__(self, cm): + """ Create a new SimulStop instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "SimulStop" + + self._startall = SimulStartLite(cm) + self._stopall = SimulStopLite(cm) + + def __call__(self, dummy): + """ Perform this test """ + + self.incr("calls") + + ret = self._startall(None) + if not ret: + return self.failure("Setup failed") + + if not self._stopall(None): + return self.failure("Stopall failed") + + return self.success() diff --git a/python/pacemaker/_cts/tests/simulstoplite.py b/python/pacemaker/_cts/tests/simulstoplite.py new file mode 100644 index 0000000..d2e687e --- /dev/null +++ b/python/pacemaker/_cts/tests/simulstoplite.py @@ -0,0 +1,91 @@ +""" Simultaneously stop running nodes """ + +__all__ = ["SimulStopLite"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class SimulStopLite(CTSTest): + """ A pseudo-test that is only used to set up conditions before running + some other test. This class stops any running nodes more or less + simultaneously. It can be used both to set up a test or to clean up + a test. + + Other test classes should not use this one as a superclass. + """ + + def __init__(self, cm): + """ Create a new SimulStopLite instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.name = "SimulStopLite" + + def __call__(self, dummy): + """ Stop all running nodes more or less simultaneously, returning + whether this succeeded or not. + """ + + self.incr("calls") + self.debug("Setup: %s" % self.name) + + # We ignore the "node" parameter... + watchpats = [] + + for node in self._env["nodes"]: + if self._cm.expected_status[node] == "up": + self.incr("WasStarted") + watchpats.append(self.templates["Pat:We_stopped"] % node) + + if len(watchpats) == 0: + return self.success() + + # Stop all the nodes - at about the same time... + watch = self.create_watch(watchpats, self._env["DeadTime"]+10) + + watch.set_watch() + self.set_timer() + for node in self._env["nodes"]: + if self._cm.expected_status[node] == "up": + self._cm.stop_cm_async(node) + + if watch.look_for_all(): + # Make sure they're completely down with no residule + for node in self._env["nodes"]: + self._rsh(node, self.templates["StopCmd"]) + + return self.success() + + did_fail = False + up_nodes = [] + for node in self._env["nodes"]: + if self._cm.stat_cm(node): + did_fail = True + up_nodes.append(node) + + if did_fail: + return self.failure("Active nodes exist: %s" % up_nodes) + + self._logger.log("Warn: All nodes stopped but CTS didn't detect: %s" % watch.unmatched) + return self.failure("Missing log message: %s " % watch.unmatched) + + def is_applicable(self): + """ SimulStopLite is a setup test and never applicable """ + + return False diff --git a/python/pacemaker/_cts/tests/splitbraintest.py b/python/pacemaker/_cts/tests/splitbraintest.py new file mode 100644 index 0000000..09d5f55 --- /dev/null +++ b/python/pacemaker/_cts/tests/splitbraintest.py @@ -0,0 +1,215 @@ +""" Create a split brain cluster and verify a resource is multiply managed """ + +__all__ = ["SplitBrainTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import time + +from pacemaker._cts.input import should_continue +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class SplitBrainTest(CTSTest): + """ A concrete test that creates a split brain cluster and verifies that + one node in each partition takes over the resource, resulting in two + nodes running the same resource. + """ + + def __init__(self, cm): + """ Create a new SplitBrainTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.is_experimental = True + self.name = "SplitBrain" + + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + + def _isolate_partition(self, partition): + """ Create a new partition containing the given nodes """ + + other_nodes = self._env["nodes"].copy() + + for node in partition: + try: + other_nodes.remove(node) + except ValueError: + self._logger.log("Node %s not in %r from %r" % (node, self._env["nodes"], partition)) + + if not other_nodes: + return + + self.debug("Creating partition: %r" % partition) + self.debug("Everyone else: %r" % other_nodes) + + for node in partition: + if not self._cm.isolate_node(node, other_nodes): + self._logger.log("Could not isolate %s" % node) + return + + def _heal_partition(self, partition): + """ Move the given nodes out of their own partition back into the cluster """ + + other_nodes = self._env["nodes"].copy() + + for node in partition: + try: + other_nodes.remove(node) + except ValueError: + self._logger.log("Node %s not in %r" % (node, self._env["nodes"])) + + if len(other_nodes) == 0: + return + + self.debug("Healing partition: %r" % partition) + self.debug("Everyone else: %r" % other_nodes) + + for node in partition: + self._cm.unisolate_node(node, other_nodes) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + self.passed = True + partitions = {} + + if not self._startall(None): + return self.failure("Setup failed") + + while True: + # Retry until we get multiple partitions + partitions = {} + p_max = len(self._env["nodes"]) + + for n in self._env["nodes"]: + p = self._env.random_gen.randint(1, p_max) + + if p not in partitions: + partitions[p] = [] + + partitions[p].append(n) + + p_max = len(partitions) + if p_max > 1: + break + # else, try again + + self.debug("Created %d partitions" % p_max) + for (key, val) in partitions.items(): + self.debug("Partition[%s]:\t%r" % (key, val)) + + # Disabling STONITH to reduce test complexity for now + self._rsh(node, "crm_attribute -V -n stonith-enabled -v false") + + for val in partitions.values(): + self._isolate_partition(val) + + count = 30 + while count > 0: + if len(self._cm.find_partitions()) != p_max: + time.sleep(10) + else: + break + else: + self.failure("Expected partitions were not created") + + # Target number of partitions formed - wait for stability + if not self._cm.cluster_stable(): + self.failure("Partitioned cluster not stable") + + # Now audit the cluster state + self._cm.partitions_expected = p_max + if not self.audit(): + self.failure("Audits failed") + + self._cm.partitions_expected = 1 + + # And heal them again + for val in partitions.values(): + self._heal_partition(val) + + # Wait for a single partition to form + count = 30 + while count > 0: + if len(self._cm.find_partitions()) != 1: + time.sleep(10) + count -= 1 + else: + break + else: + self.failure("Cluster did not reform") + + # Wait for it to have the right number of members + count = 30 + while count > 0: + members = [] + + partitions = self._cm.find_partitions() + if partitions: + members = partitions[0].split() + + if len(members) != len(self._env["nodes"]): + time.sleep(10) + count -= 1 + else: + break + else: + self.failure("Cluster did not completely reform") + + # Wait up to 20 minutes - the delay is more preferable than + # trying to continue with in a messed up state + if not self._cm.cluster_stable(1200): + self.failure("Reformed cluster not stable") + + if not should_continue(self._env): + raise ValueError("Reformed cluster not stable") + + # Turn fencing back on + if self._env["DoFencing"]: + self._rsh(node, "crm_attribute -V -D -n stonith-enabled") + + self._cm.cluster_stable() + + if self.passed: + return self.success() + + return self.failure("See previous errors") + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + r"Another DC detected:", + r"(ERROR|error).*: .*Application of an update diff failed", + r"pacemaker-controld.*:.*not in our membership list", + r"CRIT:.*node.*returning after partition" + ] + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. """ + + if not CTSTest.is_applicable(self): + return False + + return len(self._env["nodes"]) > 2 diff --git a/python/pacemaker/_cts/tests/standbytest.py b/python/pacemaker/_cts/tests/standbytest.py new file mode 100644 index 0000000..a9ce8ec --- /dev/null +++ b/python/pacemaker/_cts/tests/standbytest.py @@ -0,0 +1,110 @@ +""" Put a node into standby mode and check that resources migrate """ + +__all__ = ["StandbyTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.starttest import StartTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class StandbyTest(CTSTest): + """ A concrete tests that puts a node into standby and checks that resources + migrate away from the node + """ + + def __init__(self, cm): + """ Create a new StandbyTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.benchmark = True + self.name = "Standby" + + self._start = StartTest(cm) + self._startall = SimulStartLite(cm) + + # make sure the node is active + # set the node to standby mode + # check resources, none resource should be running on the node + # set the node to active mode + # check resources, resources should have been migrated back (SHOULD THEY?) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + ret = self._startall(None) + if not ret: + return self.failure("Start all nodes failed") + + self.debug("Make sure node %s is active" % node) + if self._cm.in_standby_mode(node): + if not self._cm.set_standby_mode(node, False): + return self.failure("can't set node %s to active mode" % node) + + self._cm.cluster_stable() + + if self._cm.in_standby_mode(node): + return self.failure("standby status of %s is [on] but we expect [off]" % node) + + watchpats = [ + r"State transition .* -> S_POLICY_ENGINE", + ] + watch = self.create_watch(watchpats, self._env["DeadTime"]+10) + watch.set_watch() + + self.debug("Setting node %s to standby mode" % node) + if not self._cm.set_standby_mode(node, True): + return self.failure("can't set node %s to standby mode" % node) + + self.set_timer("on") + + ret = watch.look_for_all() + if not ret: + self._logger.log("Patterns not found: %r" % watch.unmatched) + self._cm.set_standby_mode(node, False) + return self.failure("cluster didn't react to standby change on %s" % node) + + self._cm.cluster_stable() + + if not self._cm.in_standby_mode(node): + return self.failure("standby status of %s is [off] but we expect [on]" % node) + + self.log_timer("on") + + self.debug("Checking resources") + rscs_on_node = self._cm.active_resources(node) + if rscs_on_node: + rc = self.failure("%s set to standby, %r is still running on it" % (node, rscs_on_node)) + self.debug("Setting node %s to active mode" % node) + self._cm.set_standby_mode(node, False) + return rc + + self.debug("Setting node %s to active mode" % node) + if not self._cm.set_standby_mode(node, False): + return self.failure("can't set node %s to active mode" % node) + + self.set_timer("off") + self._cm.cluster_stable() + + if self._cm.in_standby_mode(node): + return self.failure("standby status of %s is [on] but we expect [off]" % node) + + self.log_timer("off") + + return self.success() diff --git a/python/pacemaker/_cts/tests/startonebyone.py b/python/pacemaker/_cts/tests/startonebyone.py new file mode 100644 index 0000000..6a01097 --- /dev/null +++ b/python/pacemaker/_cts/tests/startonebyone.py @@ -0,0 +1,55 @@ +""" Start all stopped nodes serially """ + +__all__ = ["StartOnebyOne"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstoplite import SimulStopLite +from pacemaker._cts.tests.starttest import StartTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class StartOnebyOne(CTSTest): + """ A concrete test that starts all stopped nodes serially """ + + def __init__(self, cm): + """ Create a new StartOnebyOne instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.name = "StartOnebyOne" + + self._start = StartTest(cm) + self._stopall = SimulStopLite(cm) + + def __call__(self, dummy): + """ Perform this test """ + + self.incr("calls") + + ret = self._stopall(None) + if not ret: + return self.failure("Test setup failed") + + failed = [] + self.set_timer() + for node in self._env["nodes"]: + if not self._start(node): + failed.append(node) + + if failed: + return self.failure("Some node failed to start: %r" % failed) + + return self.success() diff --git a/python/pacemaker/_cts/tests/starttest.py b/python/pacemaker/_cts/tests/starttest.py new file mode 100644 index 0000000..6387511 --- /dev/null +++ b/python/pacemaker/_cts/tests/starttest.py @@ -0,0 +1,54 @@ +""" Start the cluster manager on a given node """ + +__all__ = ["StartTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class StartTest(CTSTest): + """ A pseudo-test that is only used to set up conditions before running + some other test. This class starts the cluster manager on a given + node. + + Other test classes should not use this one as a superclass. + """ + + def __init__(self, cm): + """ Create a new StartTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.name = "Start" + + def __call__(self, node): + """ Start the given node, returning whether this succeeded or not """ + + self.incr("calls") + + if self._cm.upcount() == 0: + self.incr("us") + else: + self.incr("them") + + if self._cm.expected_status[node] != "down": + return self.skipped() + + if self._cm.start_cm(node): + return self.success() + + return self.failure("Startup %s on node %s failed" + % (self._env["Name"], node)) diff --git a/python/pacemaker/_cts/tests/stonithdtest.py b/python/pacemaker/_cts/tests/stonithdtest.py new file mode 100644 index 0000000..0dce291 --- /dev/null +++ b/python/pacemaker/_cts/tests/stonithdtest.py @@ -0,0 +1,145 @@ +""" Fence a running node and wait for it to restart """ + +__all__ = ["StonithdTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker.exitstatus import ExitStatus +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.timer import Timer + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class StonithdTest(CTSTest): + """ A concrete test that fences a running node and waits for it to restart """ + + def __init__(self, cm): + """ Create a new StonithdTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.benchmark = True + self.name = "Stonithd" + + self._startall = SimulStartLite(cm) + + def __call__(self, node): + """ Perform this test """ + + self.incr("calls") + if len(self._env["nodes"]) < 2: + return self.skipped() + + ret = self._startall(None) + if not ret: + return self.failure("Setup failed") + + watchpats = [ + self.templates["Pat:Fencing_ok"] % node, + self.templates["Pat:NodeFenced"] % node, + ] + + if not self._env["at-boot"]: + self.debug("Expecting %s to stay down" % node) + self._cm.expected_status[node] = "down" + else: + self.debug("Expecting %s to come up again %d" % (node, self._env["at-boot"])) + watchpats.extend([ + "%s.* S_STARTING -> S_PENDING" % node, + "%s.* S_PENDING -> S_NOT_DC" % node, + ]) + + watch = self.create_watch(watchpats, 30 + self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) + watch.set_watch() + + origin = self._env.random_gen.choice(self._env["nodes"]) + + (rc, _) = self._rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node) + + if rc == ExitStatus.TIMEOUT: + # Look for the patterns, usually this means the required + # device was running on the node to be fenced - or that + # the required devices were in the process of being loaded + # and/or moved + # + # Effectively the node committed suicide so there will be + # no confirmation, but pacemaker should be watching and + # fence the node again + + self._logger.log("Fencing command on %s to fence %s timed out" % (origin, node)) + + elif origin != node and rc != 0: + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + + self.debug("Waiting for fenced node to come back up") + self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) + + self._logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc)) + + elif origin == node and rc != 255: + # 255 == broken pipe, ie. the node was fenced as expected + self._logger.log("Locally originated fencing returned %d" % rc) + + with Timer(self._logger, self.name, "fence"): + matched = watch.look_for_all() + + self.set_timer("reform") + if watch.unmatched: + self._logger.log("Patterns not found: %r" % watch.unmatched) + + self.debug("Waiting for the cluster to recover") + self._cm.cluster_stable() + + self.debug("Waiting for fenced node to come back up") + self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) + + self.debug("Waiting for the cluster to re-stabilize with all nodes") + is_stable = self._cm.cluster_stable(self._env["StartTime"]) + + if not matched: + return self.failure("Didn't find all expected patterns") + + if not is_stable: + return self.failure("Cluster did not become stable") + + self.log_timer("reform") + return self.success() + + @property + def errors_to_ignore(self): + """ Return list of errors which should be ignored """ + + return [ + self.templates["Pat:Fencing_start"] % ".*", + self.templates["Pat:Fencing_ok"] % ".*", + self.templates["Pat:Fencing_active"], + r"error.*: Operation 'reboot' targeting .* by .* for stonith_admin.*: Timer expired" + ] + + def is_applicable(self): + """ Return True if this test is applicable in the current test configuration. """ + + if not CTSTest.is_applicable(self): + return False + + # pylint gets confused because of EnvFactory here. + # pylint: disable=unsupported-membership-test + if "DoFencing" in self._env: + return self._env["DoFencing"] + + return True diff --git a/python/pacemaker/_cts/tests/stoponebyone.py b/python/pacemaker/_cts/tests/stoponebyone.py new file mode 100644 index 0000000..d75d282 --- /dev/null +++ b/python/pacemaker/_cts/tests/stoponebyone.py @@ -0,0 +1,56 @@ +""" Stop all running nodes serially """ + +__all__ = ["StopOnebyOne"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest +from pacemaker._cts.tests.simulstartlite import SimulStartLite +from pacemaker._cts.tests.stoptest import StopTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class StopOnebyOne(CTSTest): + """ A concrete test that stops all running nodes serially """ + + def __init__(self, cm): + """ Create a new StartOnebyOne instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + + self.name = "StopOnebyOne" + + self._startall = SimulStartLite(cm) + self._stop = StopTest(cm) + + def __call__(self, dummy): + """ Perform this test """ + + self.incr("calls") + + ret = self._startall(None) + if not ret: + return self.failure("Setup failed") + + failed = [] + self.set_timer() + for node in self._env["nodes"]: + if not self._stop(node): + failed.append(node) + + if failed: + return self.failure("Some node failed to stop: %r" % failed) + + return self.success() diff --git a/python/pacemaker/_cts/tests/stoptest.py b/python/pacemaker/_cts/tests/stoptest.py new file mode 100644 index 0000000..8f496d3 --- /dev/null +++ b/python/pacemaker/_cts/tests/stoptest.py @@ -0,0 +1,99 @@ +""" Stop the cluster manager on a given node """ + +__all__ = ["StopTest"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +from pacemaker._cts.tests.ctstest import CTSTest + +# Disable various pylint warnings that occur in so many places throughout this +# file it's easiest to just take care of them globally. This does introduce the +# possibility that we'll miss some other cause of the same warning, but we'll +# just have to be careful. + +# pylint doesn't understand that self._rsh is callable. +# pylint: disable=not-callable +# pylint doesn't understand that self._env is subscriptable. +# pylint: disable=unsubscriptable-object + + +class StopTest(CTSTest): + """ A pseudo-test that is only used to set up conditions before running + some other test. This class stops the cluster manager on a given + node. + + Other test classes should not use this one as a superclass. + """ + + def __init__(self, cm): + """ Create a new StopTest instance + + Arguments: + + cm -- A ClusterManager instance + """ + + CTSTest.__init__(self, cm) + self.name = "Stop" + + def __call__(self, node): + """ Stop the given node, returning whether this succeeded or not """ + + self.incr("calls") + if self._cm.expected_status[node] != "up": + return self.skipped() + + # Technically we should always be able to notice ourselves stopping + patterns = [ + self.templates["Pat:We_stopped"] % node, + ] + + # Any active node needs to notice this one left + # (note that this won't work if we have multiple partitions) + for other in self._env["nodes"]: + if self._cm.expected_status[other] == "up" and other != node: + patterns.append(self.templates["Pat:They_stopped"] %(other, node)) + + watch = self.create_watch(patterns, self._env["DeadTime"]) + watch.set_watch() + + if node == self._cm.our_node: + self.incr("us") + else: + if self._cm.upcount() <= 1: + self.incr("all") + else: + self.incr("them") + + self._cm.stop_cm(node) + watch.look_for_all() + + failreason = None + unmatched_str = "||" + + if watch.unmatched: + (_, output) = self._rsh(node, "/bin/ps axf", verbose=1) + for line in output: + self.debug(line) + + (_, output) = self._rsh(node, "/usr/sbin/dlm_tool dump 2>/dev/null", verbose=1) + for line in output: + self.debug(line) + + for regex in watch.unmatched: + self._logger.log("ERROR: Shutdown pattern not found: %s" % regex) + unmatched_str += "%s||" % regex + failreason = "Missing shutdown pattern" + + self._cm.cluster_stable(self._env["DeadTime"]) + + if not watch.unmatched or self._cm.upcount() == 0: + return self.success() + + if len(watch.unmatched) >= self._cm.upcount(): + return self.failure("no match against (%s)" % unmatched_str) + + if failreason is None: + return self.success() + + return self.failure(failreason) diff --git a/python/pacemaker/_cts/timer.py b/python/pacemaker/_cts/timer.py new file mode 100644 index 0000000..122b70b --- /dev/null +++ b/python/pacemaker/_cts/timer.py @@ -0,0 +1,63 @@ +""" Timer-related utilities for CTS """ + +__all__ = ["Timer"] +__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import time + +class Timer: + """ A class for measuring the runtime of some task. A Timer may be used + manually or as a context manager, like so: + + with Timer(logger, "SomeTest", "SomeTimer"): + ... + + A Timer runs from when start() is called until the timer is deleted + or reset() is called. There is no explicit stop method. + """ + + def __init__(self, logger, test_name, timer_name): + """ Create a new Timer instance. + + Arguments: + + logger -- A Logger instance that can be used to record when + the timer stopped + test_name -- The name of the test this timer is being run for + timer_name -- The name of this timer + """ + + self._logger = logger + self._start_time = None + self._test_name = test_name + self._timer_name = timer_name + + def __enter__(self): + self.start() + return self + + def __exit__(self, *args): + self._logger.debug("%s:%s runtime: %.2f" % (self._test_name, self._timer_name, self.elapsed)) + + def reset(self): + """ Restart the timer """ + + self.start() + + def start(self): + """ Start the timer """ + + self._start_time = time.time() + + @property + def start_time(self): + """ When did the timer start? """ + + return self._start_time + + @property + def elapsed(self): + """ How long has the timer been running for? """ + + return time.time() - self._start_time diff --git a/python/pacemaker/_cts/watcher.py b/python/pacemaker/_cts/watcher.py index 3bdb892..3e6d702 100644 --- a/python/pacemaker/_cts/watcher.py +++ b/python/pacemaker/_cts/watcher.py @@ -13,7 +13,7 @@ from pacemaker.buildoptions import BuildOptions from pacemaker._cts.logging import LogFactory from pacemaker._cts.remote import RemoteFactory -LOG_WATCHER_BIN = BuildOptions.DAEMON_DIR + "/cts-log-watcher" +LOG_WATCHER_BIN = "%s/cts-log-watcher" % BuildOptions.DAEMON_DIR @unique class LogKind(Enum): @@ -139,7 +139,7 @@ class FileObj(SearchObj): if match: self.offset = match.group(1) - self.debug("Got %d lines, new offset: %s %s" % (len(out), self.offset, repr(self._delegate))) + self.debug("Got %d lines, new offset: %s %r" % (len(out), self.offset, self._delegate)) elif re.search(r"^CTSwatcher:.*truncated", line): self.log(line) elif re.search(r"^CTSwatcher:", line): @@ -294,8 +294,8 @@ class JournalObj(SearchObj): self.limit = lines[0].strip() self.debug("Set limit to: %s" % self.limit) else: - self.debug("Unable to set limit for %s because date returned %d lines with status %d" % (self.host, - len(lines), rc)) + self.debug("Unable to set limit for %s because date returned %d lines with status %d" + % (self.host, len(lines), rc)) class LogWatcher: """ A class for watching a single log file or journal across multiple hosts, @@ -413,7 +413,7 @@ class LogWatcher: for t in pending: t.join(60.0) if t.is_alive(): - self._logger.log("%s: Aborting after 20s waiting for %s logging commands" % (self.name, repr(t))) + self._logger.log("%s: Aborting after 20s waiting for %r logging commands" % (self.name, t)) return def end(self): diff --git a/python/pacemaker/buildoptions.py.in b/python/pacemaker/buildoptions.py.in index 53b492b..17fe981 100644 --- a/python/pacemaker/buildoptions.py.in +++ b/python/pacemaker/buildoptions.py.in @@ -22,6 +22,9 @@ class BuildOptions: CIB_DIR = "@CRM_CONFIG_DIR@" """ Where CIB files are stored """ + CIB_SCHEMA_VERSION = "@CIB_VERSION@" + """ Latest supported CIB schema version number """ + COROSYNC_CONFIG_FILE = "@PCMK__COROSYNC_CONF@" """ Path to the corosync config file """ diff --git a/python/pylintrc b/python/pylintrc index e65110b..f46eece 100644 --- a/python/pylintrc +++ b/python/pylintrc @@ -446,7 +446,8 @@ exclude-too-few-public-methods= [CLASSES] # List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp,__post_init__ +# CHANGED: Remove setUp and __post_init__, add reset +defining-attr-methods=__init__,__new__,reset # List of valid names for the first argument in a class method. valid-classmethod-first-arg=cls diff --git a/python/setup.py.in b/python/setup.py.in index c4083da..e9d61d0 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -16,5 +16,5 @@ setup(name='pacemaker', license='LGPLv2.1+', url='https://clusterlabs.org/pacemaker/', description='Python libraries for Pacemaker', - packages=['pacemaker', 'pacemaker._cts'], + packages=['pacemaker', 'pacemaker._cts', 'pacemaker._cts.tests'], ) diff --git a/python/tests/Makefile.am b/python/tests/Makefile.am index 490b272..219812c 100644 --- a/python/tests/Makefile.am +++ b/python/tests/Makefile.am @@ -9,4 +9,5 @@ MAINTAINERCLEANFILES = Makefile.in -EXTRA_DIST = $(wildcard test_*) +EXTRA_DIST = $(wildcard test_*) \ + __init__.py diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/python/tests/__init__.py diff --git a/python/tests/test_cts_network.py b/python/tests/test_cts_network.py new file mode 100644 index 0000000..4aea8b9 --- /dev/null +++ b/python/tests/test_cts_network.py @@ -0,0 +1,37 @@ +# These warnings are not useful in unit tests. +# pylint: disable=missing-class-docstring,missing-function-docstring,missing-module-docstring + +__copyright__ = "Copyright 2023 the Pacemaker project contributors" +__license__ = "GPLv2+" + +import unittest + +from pacemaker._cts.network import next_ip + +# next_ip makes a bunch of assumptions that we are not going to test here: +# +# * The env argument actually contains an "IPBase" key with a string in it +# * The string is a properly formatted IPv4 or IPv6 address, with no extra +# leading or trailing whitespace + +class NextIPTestCase(unittest.TestCase): + def test_ipv4(self): + # The first time next_ip is called, it will read the IPBase out of + # the environment. After that, it just goes off whatever it + # previously returned, so the environment value doesn't matter. + self.assertEqual(next_ip("1.1.1.1"), "1.1.1.2") + self.assertEqual(next_ip(), "1.1.1.3") + + # Passing reset=True will force it to re-read the environment. Here, + # we use that to ask it for a value outside of the available range. + self.assertRaises(ValueError, next_ip, "1.1.1.255", reset=True) + + def test_ipv6(self): + # Same comments as for the test_ipv4 function, plus we need to reset + # here because otherwise it will remember what happened in that function. + self.assertEqual(next_ip("fe80::fc54:ff:fe09:101", reset=True), + "fe80::fc54:ff:fe09:102") + self.assertEqual(next_ip(), + "fe80::fc54:ff:fe09:103") + + self.assertRaises(ValueError, next_ip, "fe80::fc54:ff:fe09:ffff", reset=True) |