Diffstat:
 cts/lab/ClusterManager.py | 940 --------------------------------------------------------------------------------
 1 file changed, 0 insertions(+), 940 deletions(-)
diff --git a/cts/lab/ClusterManager.py b/cts/lab/ClusterManager.py
deleted file mode 100644
index fda4cfb..0000000
--- a/cts/lab/ClusterManager.py
+++ /dev/null
@@ -1,940 +0,0 @@
-""" ClusterManager class for Pacemaker's Cluster Test Suite (CTS)
-"""
-
-__copyright__ = """Copyright 2000-2023 the Pacemaker project contributors.
-Certain portions by Huang Zhen <zhenhltc@cn.ibm.com> are copyright 2004
-International Business Machines. The version control history for this file
-may have further details."""
-__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
-
-import os
-import re
-import time
-
-from collections import UserDict
-
-from cts.CIB import ConfigFactory
-from cts.CTStests import AuditResource
-
-from pacemaker.buildoptions import BuildOptions
-from pacemaker._cts.CTS import NodeStatus, Process
-from pacemaker._cts.environment import EnvFactory
-from pacemaker._cts.logging import LogFactory
-from pacemaker._cts.patterns import PatternSelector
-from pacemaker._cts.remote import RemoteFactory
-from pacemaker._cts.watcher import LogWatcher
-
-class ClusterManager(UserDict):
-    '''The Cluster Manager class.
-    This is an subclass of the Python dictionary class.
-    (this is because it contains lots of {name,value} pairs,
-    not because it's behavior is that terribly similar to a
-    dictionary in other ways.)
-
-    This is an abstract class which class implements high-level
-    operations on the cluster and/or its cluster managers.
-    Actual cluster managers classes are subclassed from this type.
-
-    One of the things we do is track the state we think every node should
-    be in.
-    '''
-
-    def __InitialConditions(self):
-        #if os.geteuid() != 0:
-        #  raise ValueError("Must Be Root!")
-        None
-
-    def _finalConditions(self):
-        for key in list(self.keys()):
-            if self[key] == None:
-                raise ValueError("Improper derivation: self[" + key + "] must be overridden by subclass.")
-
-    def __init__(self):
-        self.Env = EnvFactory().getInstance()
-        self.templates = PatternSelector(self.Env["Name"])
-        self.__InitialConditions()
-        self.logger = LogFactory()
-        self.TestLoggingLevel=0
-        self.data = {}
-        self.name = self.Env["Name"]
-
-        self.rsh = RemoteFactory().getInstance()
-        self.ShouldBeStatus={}
-        self.ns = NodeStatus(self.Env)
-        self.OurNode = os.uname()[1].lower()
-        self.__instance_errorstoignore = []
-
-        self.cib_installed = 0
-        self.config = None
-        self.cluster_monitor = 0
-        self.use_short_names = 1
-
-        if self.Env["DoBSC"]:
-            del self.templates["Pat:They_stopped"]
-
-        self._finalConditions()
-
-        self.check_transitions = 0
-        self.check_elections = 0
-        self.CIBsync = {}
-        self.CibFactory = ConfigFactory(self)
-        self.cib = self.CibFactory.createConfig(self.Env["Schema"])
-
-    def __getitem__(self, key):
-        if key == "Name":
-            return self.name
-
-        print("FIXME: Getting %s from %s" % (key, repr(self)))
-        if key in self.data:
-            return self.data[key]
-
-        return self.templates.get_patterns(key)
-
-    def __setitem__(self, key, value):
-        print("FIXME: Setting %s=%s on %s" % (key, value, repr(self)))
-        self.data[key] = value
-
-    def key_for_node(self, node):
-        return node
-
-    def instance_errorstoignore_clear(self):
-        '''Allows the test scenario to reset instance errors to ignore on each iteration.'''
-        self.__instance_errorstoignore = []
-
-    def instance_errorstoignore(self):
-        '''Return list of errors which are 'normal' for a specific test instance'''
-        return self.__instance_errorstoignore
-
-    def log(self, args):
-        self.logger.log(args)
-
-    def debug(self, args):
-        self.logger.debug(args)
-
-    def upcount(self):
-        '''How many nodes are up?'''
-        count = 0
-        for node in self.Env["nodes"]:
-            if self.ShouldBeStatus[node] == "up":
-                count = count + 1
-        return count
-
-    def install_support(self, command="install"):
-        for node in self.Env["nodes"]:
-            self.rsh(node, BuildOptions.DAEMON_DIR + "/cts-support " + command)
-
-    def prepare_fencing_watcher(self, name):
-        # If we don't have quorum now but get it as a result of starting this node,
-        # then a bunch of nodes might get fenced
-        upnode = None
-        if self.HasQuorum(None):
-            self.debug("Have quorum")
-            return None
-
-        if not self.templates["Pat:Fencing_start"]:
-            print("No start pattern")
-            return None
-
-        if not self.templates["Pat:Fencing_ok"]:
-            print("No ok pattern")
-            return None
-
-        stonith = None
-        stonithPats = []
-        for peer in self.Env["nodes"]:
-            if self.ShouldBeStatus[peer] != "up":
-                stonithPats.append(self.templates["Pat:Fencing_ok"] % peer)
-                stonithPats.append(self.templates["Pat:Fencing_start"] % peer)
-
-        stonith = LogWatcher(self.Env["LogFileName"], stonithPats, self.Env["nodes"], self.Env["LogWatcher"], "StartupFencing", 0)
-        stonith.set_watch()
-        return stonith
-
-    def fencing_cleanup(self, node, stonith):
-        peer_list = []
-        peer_state = {}
-
-        self.debug("Looking for nodes that were fenced as a result of %s starting" % node)
-
-        # If we just started a node, we may now have quorum (and permission to fence)
-        if not stonith:
-            self.debug("Nothing to do")
-            return peer_list
-
-        q = self.HasQuorum(None)
-        if not q and len(self.Env["nodes"]) > 2:
-            # We didn't gain quorum - we shouldn't have shot anyone
-            self.debug("Quorum: %d Len: %d" % (q, len(self.Env["nodes"])))
-            return peer_list
-
-        for n in self.Env["nodes"]:
-            peer_state[n] = "unknown"
-
-        # Now see if any states need to be updated
-        self.debug("looking for: " + repr(stonith.regexes))
-        shot = stonith.look(0)
-        while shot:
-            line = repr(shot)
-            self.debug("Found: " + line)
-            del stonith.regexes[stonith.whichmatch]
-
-            # Extract node name
-            for n in self.Env["nodes"]:
-                if re.search(self.templates["Pat:Fencing_ok"] % n, shot):
-                    peer = n
-                    peer_state[peer] = "complete"
-                    self.__instance_errorstoignore.append(self.templates["Pat:Fencing_ok"] % peer)
-
-                elif peer_state[n] != "complete" and re.search(self.templates["Pat:Fencing_start"] % n, shot):
-                    # TODO: Correctly detect multiple fencing operations for the same host
-                    peer = n
-                    peer_state[peer] = "in-progress"
-                    self.__instance_errorstoignore.append(self.templates["Pat:Fencing_start"] % peer)
-
-            if not peer:
-                self.logger.log("ERROR: Unknown stonith match: %s" % line)
-
-            elif not peer in peer_list:
-                self.debug("Found peer: " + peer)
-                peer_list.append(peer)
-
-            # Get the next one
-            shot = stonith.look(60)
-
-        for peer in peer_list:
-
-            self.debug(" Peer %s was fenced as a result of %s starting: %s" % (peer, node, peer_state[peer]))
-            if self.Env["at-boot"]:
-                self.ShouldBeStatus[peer] = "up"
-            else:
-                self.ShouldBeStatus[peer] = "down"
-
-            if peer_state[peer] == "in-progress":
-                # Wait for any in-progress operations to complete
-                shot = stonith.look(60)
-                while len(stonith.regexes) and shot:
-                    line = repr(shot)
-                    self.debug("Found: " + line)
-                    del stonith.regexes[stonith.whichmatch]
-                    shot = stonith.look(60)
-
-            # Now make sure the node is alive too
-            self.ns.wait_for_node(peer, self.Env["DeadTime"])
-
-            # Poll until it comes up
-            if self.Env["at-boot"]:
-                if not self.StataCM(peer):
-                    time.sleep(self.Env["StartTime"])
-
-                if not self.StataCM(peer):
-                    self.logger.log("ERROR: Peer %s failed to restart after being fenced" % peer)
-                    return None
-
-        return peer_list
-
-    def StartaCM(self, node, verbose=False):
-
-        '''Start up the cluster manager on a given node'''
-        if verbose: self.logger.log("Starting %s on node %s" % (self.templates["Name"], node))
-        else: self.debug("Starting %s on node %s" % (self.templates["Name"], node))
-        ret = 1
-
-        if not node in self.ShouldBeStatus:
-            self.ShouldBeStatus[node] = "down"
-
-        if self.ShouldBeStatus[node] != "down":
-            return 1
-
-        patterns = []
-        # Technically we should always be able to notice ourselves starting
-        patterns.append(self.templates["Pat:Local_started"] % node)
-        if self.upcount() == 0:
-            patterns.append(self.templates["Pat:DC_started"] % node)
-        else:
-            patterns.append(self.templates["Pat:NonDC_started"] % node)
-
-        watch = LogWatcher(
-            self.Env["LogFileName"], patterns, self.Env["nodes"], self.Env["LogWatcher"], "StartaCM", self.Env["StartTime"]+10)
-
-        self.install_config(node)
-
-        self.ShouldBeStatus[node] = "any"
-        if self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
-            self.logger.log ("%s was already started" % (node))
-            return 1
-
-        stonith = self.prepare_fencing_watcher(node)
-        watch.set_watch()
-
-        (rc, _) = self.rsh(node, self.templates["StartCmd"])
-        if rc != 0:
-            self.logger.log ("Warn: Start command failed on node %s" % (node))
-            self.fencing_cleanup(node, stonith)
-            return None
-
-        self.ShouldBeStatus[node] = "up"
-        watch_result = watch.look_for_all()
-
-        if watch.unmatched:
-            for regex in watch.unmatched:
-                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))
-
-        if watch_result and self.cluster_stable(self.Env["DeadTime"]):
-            #self.debug("Found match: "+ repr(watch_result))
-            self.fencing_cleanup(node, stonith)
-            return 1
-
-        elif self.StataCM(node) and self.cluster_stable(self.Env["DeadTime"]):
-            self.fencing_cleanup(node, stonith)
-            return 1
-
-        self.logger.log ("Warn: Start failed for node %s" % (node))
-        return None
-
-    def StartaCMnoBlock(self, node, verbose=False):
-
-        '''Start up the cluster manager on a given node with none-block mode'''
-
-        if verbose: self.logger.log("Starting %s on node %s" % (self["Name"], node))
-        else: self.debug("Starting %s on node %s" % (self["Name"], node))
-
-        self.install_config(node)
-        self.rsh(node, self.templates["StartCmd"], synchronous=False)
-        self.ShouldBeStatus[node] = "up"
-        return 1
-
-    def StopaCM(self, node, verbose=False, force=False):
-
-        '''Stop the cluster manager on a given node'''
-
-        if verbose: self.logger.log("Stopping %s on node %s" % (self["Name"], node))
-        else: self.debug("Stopping %s on node %s" % (self["Name"], node))
-
-        if self.ShouldBeStatus[node] != "up" and force == False:
-            return 1
-
-        (rc, _) = self.rsh(node, self.templates["StopCmd"])
-        if rc == 0:
-            # Make sure we can continue even if corosync leaks
-            # fdata-* is the old name
-            #self.rsh(node, "rm -rf /dev/shm/qb-* /dev/shm/fdata-*")
-            self.ShouldBeStatus[node] = "down"
-            self.cluster_stable(self.Env["DeadTime"])
-            return 1
-        else:
-            self.logger.log ("ERROR: Could not stop %s on node %s" % (self["Name"], node))
-
-        return None
-
-    def StopaCMnoBlock(self, node):
-
-        '''Stop the cluster manager on a given node with none-block mode'''
-
-        self.debug("Stopping %s on node %s" % (self["Name"], node))
-
-        self.rsh(node, self.templates["StopCmd"], synchronous=False)
-        self.ShouldBeStatus[node] = "down"
-        return 1
-
-    def RereadCM(self, node):
-
-        '''Force the cluster manager on a given node to reread its config
-           This may be a no-op on certain cluster managers.
-        '''
-        (rc, _) = self.rsh(node, self.templates["RereadCmd"])
-        if rc == 0:
-            return 1
-        else:
-            self.logger.log ("Could not force %s on node %s to reread its config"
-                             % (self["Name"], node))
-        return None
-
-    def startall(self, nodelist=None, verbose=False, quick=False):
-
-        '''Start the cluster manager on every node in the cluster.
-        We can do it on a subset of the cluster if nodelist is not None.
-        '''
-        map = {}
-        if not nodelist:
-            nodelist = self.Env["nodes"]
-
-        for node in nodelist:
-            if self.ShouldBeStatus[node] == "down":
-                self.ns.wait_for_all_nodes(nodelist, 300)
-
-        if not quick:
-            # This is used for "basic sanity checks", so only start one node ...
-            if not self.StartaCM(node, verbose=verbose):
-                return 0
-            return 1
-
-        # Approximation of SimulStartList for --boot
-        watchpats = [ ]
-        watchpats.append(self.templates["Pat:DC_IDLE"])
-        for node in nodelist:
-            watchpats.append(self.templates["Pat:InfraUp"] % node)
-            watchpats.append(self.templates["Pat:PacemakerUp"] % node)
-            watchpats.append(self.templates["Pat:Local_started"] % node)
-            watchpats.append(self.templates["Pat:They_up"] % (nodelist[0], node))
-
-        # Start all the nodes - at about the same time...
-        watch = LogWatcher(self.Env["LogFileName"], watchpats, self.Env["nodes"], self.Env["LogWatcher"], "fast-start", self.Env["DeadTime"]+10)
-        watch.set_watch()
-
-        if not self.StartaCM(nodelist[0], verbose=verbose):
-            return 0
-        for node in nodelist:
-            self.StartaCMnoBlock(node, verbose=verbose)
-
-        watch.look_for_all()
-        if watch.unmatched:
-            for regex in watch.unmatched:
-                self.logger.log ("Warn: Startup pattern not found: %s" % (regex))
-
-        if not self.cluster_stable():
-            self.logger.log("Cluster did not stabilize")
-            return 0
-
-        return 1
-
-    def stopall(self, nodelist=None, verbose=False, force=False):
-
-        '''Stop the cluster managers on every node in the cluster.
-        We can do it on a subset of the cluster if nodelist is not None.
-        '''
-
-        ret = 1
-        map = {}
-        if not nodelist:
-            nodelist = self.Env["nodes"]
-        for node in self.Env["nodes"]:
-            if self.ShouldBeStatus[node] == "up" or force == True:
-                if not self.StopaCM(node, verbose=verbose, force=force):
-                    ret = 0
-        return ret
-
-    def rereadall(self, nodelist=None):
-
-        '''Force the cluster managers on every node in the cluster
-        to reread their config files. We can do it on a subset of the
-        cluster if nodelist is not None.
-        '''
-
-        map = {}
-        if not nodelist:
-            nodelist = self.Env["nodes"]
-        for node in self.Env["nodes"]:
-            if self.ShouldBeStatus[node] == "up":
-                self.RereadCM(node)
-
-    def statall(self, nodelist=None):
-
-        '''Return the status of the cluster managers in the cluster.
-        We can do it on a subset of the cluster if nodelist is not None.
-        '''
-
-        result = {}
-        if not nodelist:
-            nodelist = self.Env["nodes"]
-        for node in nodelist:
-            if self.StataCM(node):
-                result[node] = "up"
-            else:
-                result[node] = "down"
-        return result
-
-    def isolate_node(self, target, nodes=None):
-        '''isolate the communication between the nodes'''
-        if not nodes:
-            nodes = self.Env["nodes"]
-
-        for node in nodes:
-            if node != target:
-                rc = self.rsh(target, self.templates["BreakCommCmd"] % self.key_for_node(node))
-                if rc != 0:
-                    self.logger.log("Could not break the communication between %s and %s: %d" % (target, node, rc))
-                    return None
-                else:
-                    self.debug("Communication cut between %s and %s" % (target, node))
-        return 1
-
-    def unisolate_node(self, target, nodes=None):
-        '''fix the communication between the nodes'''
-        if not nodes:
-            nodes = self.Env["nodes"]
-
-        for node in nodes:
-            if node != target:
-                restored = 0
-
-                # Limit the amount of time we have asynchronous connectivity for
-                # Restore both sides as simultaneously as possible
-                self.rsh(target, self.templates["FixCommCmd"] % self.key_for_node(node), synchronous=False)
-                self.rsh(node, self.templates["FixCommCmd"] % self.key_for_node(target), synchronous=False)
-                self.debug("Communication restored between %s and %s" % (target, node))
-
-    def oprofileStart(self, node=None):
-        if not node:
-            for n in self.Env["oprofile"]:
-                self.oprofileStart(n)
-
-        elif node in self.Env["oprofile"]:
-            self.debug("Enabling oprofile on %s" % node)
-            self.rsh(node, "opcontrol --init")
-            self.rsh(node, "opcontrol --setup --no-vmlinux --separate=lib --callgraph=20 --image=all")
-            self.rsh(node, "opcontrol --start")
-            self.rsh(node, "opcontrol --reset")
-
-    def oprofileSave(self, test, node=None):
-        if not node:
-            for n in self.Env["oprofile"]:
-                self.oprofileSave(test, n)
-
-        elif node in self.Env["oprofile"]:
-            self.rsh(node, "opcontrol --dump")
-            self.rsh(node, "opcontrol --save=cts.%d" % test)
-            # Read back with: opreport -l session:cts.0 image:<directory>/c*
-            if None:
-                self.rsh(node, "opcontrol --reset")
-            else:
-                self.oprofileStop(node)
-                self.oprofileStart(node)
-
-    def oprofileStop(self, node=None):
-        if not node:
-            for n in self.Env["oprofile"]:
-                self.oprofileStop(n)
-
-        elif node in self.Env["oprofile"]:
-            self.debug("Stopping oprofile on %s" % node)
-            self.rsh(node, "opcontrol --reset")
-            self.rsh(node, "opcontrol --shutdown 2>&1 > /dev/null")
-
-    def errorstoignore(self):
-        # At some point implement a more elegant solution that
-        # also produces a report at the end
-        """ Return a list of known error messages that should be ignored """
-        return self.templates.get_patterns("BadNewsIgnore")
-
-    def install_config(self, node):
-        if not self.ns.wait_for_node(node):
-            self.log("Node %s is not up." % node)
-            return None
-
-        if not node in self.CIBsync and self.Env["ClobberCIB"]:
-            self.CIBsync[node] = 1
-            self.rsh(node, "rm -f " + BuildOptions.CIB_DIR + "/cib*")
-
-            # Only install the CIB on the first node, all the other ones will pick it up from there
-            if self.cib_installed == 1:
-                return None
-
-            self.cib_installed = 1
-            if self.Env["CIBfilename"] == None:
-                self.log("Installing Generated CIB on node %s" % (node))
-                self.cib.install(node)
-
-            else:
-                self.log("Installing CIB (%s) on node %s" % (self.Env["CIBfilename"], node))
-                if self.rsh.copy(self.Env["CIBfilename"], "root@" + (self.templates["CIBfile"] % node)) != 0:
-                    raise ValueError("Can not scp file to %s %d"%(node))
-
-            self.rsh(node, "chown " + BuildOptions.DAEMON_USER + " " + BuildOptions.CIB_DIR + "/cib.xml")
-
-    def prepare(self):
-        '''Finish the Initialization process. Prepare to test...'''
-
-        self.partitions_expected = 1
-        for node in self.Env["nodes"]:
-            self.ShouldBeStatus[node] = ""
-            if self.Env["experimental-tests"]:
-                self.unisolate_node(node)
-            self.StataCM(node)
-
-    def test_node_CM(self, node):
-        '''Report the status of the cluster manager on a given node'''
-
-        watchpats = [ ]
-        watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)")
-        watchpats.append(self.templates["Pat:NonDC_started"] % node)
-        watchpats.append(self.templates["Pat:DC_started"] % node)
-        idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, [node], self.Env["LogWatcher"], "ClusterIdle")
-        idle_watch.set_watch()
-
-        (_, out) = self.rsh(node, self.templates["StatusCmd"]%node, verbose=1)
-
-        if not out:
-            out = ""
-        else:
-            out = out[0].strip()
-
-        self.debug("Node %s status: '%s'" %(node, out))
-
-        if out.find('ok') < 0:
-            if self.ShouldBeStatus[node] == "up":
-                self.log(
-                    "Node status for %s is %s but we think it should be %s"
-                    % (node, "down", self.ShouldBeStatus[node]))
-            self.ShouldBeStatus[node] = "down"
-            return 0
-
-        if self.ShouldBeStatus[node] == "down":
-            self.log(
-                "Node status for %s is %s but we think it should be %s: %s"
-                % (node, "up", self.ShouldBeStatus[node], out))
-
-        self.ShouldBeStatus[node] = "up"
-
-        # check the output first - because syslog-ng loses messages
-        if out.find('S_NOT_DC') != -1:
-            # Up and stable
-            return 2
-        if out.find('S_IDLE') != -1:
-            # Up and stable
-            return 2
-
-        # fall back to syslog-ng and wait
-        if not idle_watch.look():
-            # just up
-            self.debug("Warn: Node %s is unstable: %s" % (node, out))
-            return 1
-
-        # Up and stable
-        return 2
-
-    # Is the node up or is the node down
-    def StataCM(self, node):
-        '''Report the status of the cluster manager on a given node'''
-
-        if self.test_node_CM(node) > 0:
-            return 1
-        return None
-
-    # Being up and being stable is not the same question...
-    def node_stable(self, node):
-        '''Report the status of the cluster manager on a given node'''
-
-        if self.test_node_CM(node) == 2:
-            return 1
-        self.log("Warn: Node %s not stable" % (node))
-        return None
-
-    def partition_stable(self, nodes, timeout=None):
-        watchpats = [ ]
-        watchpats.append("Current ping state: S_IDLE")
-        watchpats.append(self.templates["Pat:DC_IDLE"])
-        self.debug("Waiting for cluster stability...")
-
-        if timeout == None:
-            timeout = self.Env["DeadTime"]
-
-        if len(nodes) < 3:
-            self.debug("Cluster is inactive")
-            return 1
-
-        idle_watch = LogWatcher(self.Env["LogFileName"], watchpats, nodes.split(), self.Env["LogWatcher"], "ClusterStable", timeout)
-        idle_watch.set_watch()
-
-        for node in nodes.split():
-            # have each node dump its current state
-            self.rsh(node, self.templates["StatusCmd"] % node, verbose=1)
-
-        ret = idle_watch.look()
-        while ret:
-            self.debug(ret)
-            for node in nodes.split():
-                if re.search(node, ret):
-                    return 1
-            ret = idle_watch.look()
-
-        self.debug("Warn: Partition %s not IDLE after %ds" % (repr(nodes), timeout))
-        return None
-
-    def cluster_stable(self, timeout=None, double_check=False):
-        partitions = self.find_partitions()
-
-        for partition in partitions:
-            if not self.partition_stable(partition, timeout):
-                return None
-
-        if double_check:
-            # Make sure we are really stable and that all resources,
-            # including those that depend on transient node attributes,
-            # are started if they were going to be
-            time.sleep(5)
-            for partition in partitions:
-                if not self.partition_stable(partition, timeout):
-                    return None
-
-        return 1
-
-    def is_node_dc(self, node, status_line=None):
-        rc = 0
-
-        if not status_line:
-            (_, out) = self.rsh(node, self.templates["StatusCmd"]%node, verbose=1)
-
-            if out:
-                status_line = out[0].strip()
-
-        if not status_line:
-            rc = 0
-        elif status_line.find('S_IDLE') != -1:
-            rc = 1
-        elif status_line.find('S_INTEGRATION') != -1:
-            rc = 1
-        elif status_line.find('S_FINALIZE_JOIN') != -1:
-            rc = 1
-        elif status_line.find('S_POLICY_ENGINE') != -1:
-            rc = 1
-        elif status_line.find('S_TRANSITION_ENGINE') != -1:
-            rc = 1
-
-        return rc
-
-    def active_resources(self, node):
-        (_, output) = self.rsh(node, "crm_resource -c", verbose=1)
-        resources = []
-        for line in output:
-            if re.search("^Resource", line):
-                tmp = AuditResource(self, line)
-                if tmp.type == "primitive" and tmp.host == node:
-                    resources.append(tmp.id)
-        return resources
-
-    def ResourceLocation(self, rid):
-        ResourceNodes = []
-        for node in self.Env["nodes"]:
-            if self.ShouldBeStatus[node] == "up":
-
-                cmd = self.templates["RscRunning"] % (rid)
-                (rc, lines) = self.rsh(node, cmd)
-
-                if rc == 127:
-                    self.log("Command '%s' failed. Binary or pacemaker-cts package not installed?" % cmd)
-                    for line in lines:
-                        self.log("Output: "+line)
-                elif rc == 0:
-                    ResourceNodes.append(node)
-
-        return ResourceNodes
-
-    def find_partitions(self):
-        ccm_partitions = []
-
-        for node in self.Env["nodes"]:
-            if self.ShouldBeStatus[node] == "up":
-                (_, out) = self.rsh(node, self.templates["PartitionCmd"], verbose=1)
-
-                if not out:
-                    self.log("no partition details for %s" % node)
-                    continue
-
-                partition = out[0].strip()
-
-                if len(partition) > 2:
-                    nodes = partition.split()
-                    nodes.sort()
-                    partition = ' '.join(nodes)
-
-                    found = 0
-                    for a_partition in ccm_partitions:
-                        if partition == a_partition:
-                            found = 1
-                    if found == 0:
-                        self.debug("Adding partition from %s: %s" % (node, partition))
-                        ccm_partitions.append(partition)
-                    else:
-                        self.debug("Partition '%s' from %s is consistent with existing entries" % (partition, node))
-
-                else:
-                    self.log("bad partition details for %s" % node)
-            else:
-                self.debug("Node %s is down... skipping" % node)
-
-        self.debug("Found partitions: %s" % repr(ccm_partitions) )
-        return ccm_partitions
-
-    def HasQuorum(self, node_list):
-        # If we are auditing a partition, then one side will
-        # have quorum and the other not.
-        # So the caller needs to tell us which we are checking
-        # If no value for node_list is specified... assume all nodes
-        if not node_list:
-            node_list = self.Env["nodes"]
-
-        for node in node_list:
-            if self.ShouldBeStatus[node] == "up":
-                (_, quorum) = self.rsh(node, self.templates["QuorumCmd"], verbose=1)
-                quorum = quorum[0].strip()
-
-                if quorum.find("1") != -1:
-                    return 1
-                elif quorum.find("0") != -1:
-                    return 0
-                else:
-                    self.debug("WARN: Unexpected quorum test result from " + node + ":" + quorum)
-
-        return 0
-
-    def Components(self):
-        complist = []
-        common_ignore = [
-                    "Pending action:",
-                    "(ERROR|error): crm_log_message_adv:",
-                    "(ERROR|error): MSG: No message to dump",
-                    "pending LRM operations at shutdown",
-                    "Lost connection to the CIB manager",
-                    "Connection to the CIB terminated...",
-                    "Sending message to the CIB manager FAILED",
-                    "Action A_RECOVER .* not supported",
-                    "(ERROR|error): stonithd_op_result_ready: not signed on",
-                    "pingd.*(ERROR|error): send_update: Could not send update",
-                    "send_ipc_message: IPC Channel to .* is not connected",
-                    "unconfirmed_actions: Waiting on .* unconfirmed actions",
-                    "cib_native_msgready: Message pending on command channel",
-                    r": Performing A_EXIT_1 - forcefully exiting ",
-                    r"Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
-            ]
-
-        stonith_ignore = [
-            r"Updating failcount for child_DoFencing",
-            r"error.*: Fencer connection failed \(will retry\)",
-            "pacemaker-execd.*(ERROR|error): stonithd_receive_ops_result failed.",
-            ]
-
-        stonith_ignore.extend(common_ignore)
-
-        ccm = Process(self, "ccm", pats = [
-                    "State transition .* S_RECOVERY",
-                    "pacemaker-controld.*Action A_RECOVER .* not supported",
-                    r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
-                    r"pacemaker-controld.*: Could not recover from internal error",
-                    "pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy",
-                    # these status numbers are likely wrong now
-                    r"pacemaker-controld.*exited with status 2",
-                    r"attrd.*exited with status 1",
-                    r"cib.*exited with status 2",
-
-# Not if it was fenced
-#                    "A new node joined the cluster",
-
-#                    "WARN: determine_online_status: Node .* is unclean",
-#                    "Scheduling node .* for fencing",
-#                    "Executing .* fencing operation",
-#                    "tengine_stonith_callback: .*result=0",
-#                    "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE",
-#                    "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN",
-                    "State transition S_STARTING -> S_PENDING",
-                    ], badnews_ignore = common_ignore)
-
-        based = Process(self, "pacemaker-based", pats = [
-                    "State transition .* S_RECOVERY",
-                    "Lost connection to the CIB manager",
-                    "Connection to the CIB manager terminated",
-                    r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
-                    "pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy",
-                    r"pacemaker-controld.*: Could not recover from internal error",
-                    # these status numbers are likely wrong now
-                    r"pacemaker-controld.*exited with status 2",
-                    r"attrd.*exited with status 1",
-                    ], badnews_ignore = common_ignore)
-
-        execd = Process(self, "pacemaker-execd", pats = [
-                    "State transition .* S_RECOVERY",
-                    "LRM Connection failed",
-                    "pacemaker-controld.*I_ERROR.*lrm_connection_destroy",
-                    "State transition S_STARTING -> S_PENDING",
-                    r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
-                    r"pacemaker-controld.*: Could not recover from internal error",
-                    # this status number is likely wrong now
-                    r"pacemaker-controld.*exited with status 2",
-                    ], badnews_ignore = common_ignore)
-
-        controld = Process(self, "pacemaker-controld",
-                    pats = [
-#                    "WARN: determine_online_status: Node .* is unclean",
-#                    "Scheduling node .* for fencing",
-#                    "Executing .* fencing operation",
-#                    "tengine_stonith_callback: .*result=0",
-                    "State transition .* S_IDLE",
-                    "State transition S_STARTING -> S_PENDING",
-                    ], badnews_ignore = common_ignore)
-
-        schedulerd = Process(self, "pacemaker-schedulerd", pats = [
-                    "State transition .* S_RECOVERY",
-                    r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover",
-                    r"pacemaker-controld.*: Could not recover from internal error",
-                    r"pacemaker-controld.*CRIT.*: Connection to the scheduler failed",
-                    "pacemaker-controld.*I_ERROR.*save_cib_contents",
-                    # this status number is likely wrong now
-                    r"pacemaker-controld.*exited with status 2",
-                    ], badnews_ignore = common_ignore, dc_only=True)
-
-        if self.Env["DoFencing"]:
-            complist.append(Process(self, "stoniths", dc_pats = [
-                        r"pacemaker-controld.*CRIT.*: Fencing daemon connection failed",
-                        "Attempting connection to fencing daemon",
-                    ], badnews_ignore = stonith_ignore))
-
-        ccm.pats.extend([
-                    # these status numbers are likely wrong now
-                    r"attrd.*exited with status 1",
-                    r"pacemaker-(based|controld).*exited with status 2",
-            ])
-        based.pats.extend([
-                    # these status numbers are likely wrong now
-                    r"attrd.*exited with status 1",
-                    r"pacemaker-controld.*exited with status 2",
-            ])
-        execd.pats.extend([
-                    # these status numbers are likely wrong now
-                    r"pacemaker-controld.*exited with status 2",
-            ])
-
-        complist.append(ccm)
-        complist.append(based)
-        complist.append(execd)
-        complist.append(controld)
-        complist.append(schedulerd)
-
-        return complist
-
-    def StandbyStatus(self, node):
-        (_, out) = self.rsh(node, self.templates["StandbyQueryCmd"] % node, verbose=1)
-        if not out:
-            return "off"
-        out = out[0].strip()
-        self.debug("Standby result: "+out)
-        return out
-
-    # status == "on" : Enter Standby mode
-    # status == "off": Enter Active mode
-    def SetStandbyMode(self, node, status):
-        current_status = self.StandbyStatus(node)
-        cmd = self.templates["StandbyCmd"] % (node, status)
-        self.rsh(node, cmd)
-        return True
-
-    def AddDummyRsc(self, node, rid):
-        rsc_xml = """ '<resources>
-                <primitive class=\"ocf\" id=\"%s\" provider=\"pacemaker\" type=\"Dummy\">
-                    <operations>
-                        <op id=\"%s-interval-10s\" interval=\"10s\" name=\"monitor\"/>
-                    </operations>
-                </primitive>
-            </resources>'""" % (rid, rid)
-        constraint_xml = """ '<constraints>
-                <rsc_location id=\"location-%s-%s\" node=\"%s\" rsc=\"%s\" score=\"INFINITY\"/>
-            </constraints>'
-            """ % (rid, node, node, rid)
-
-        self.rsh(node, self.templates['CibAddXml'] % (rsc_xml))
-        self.rsh(node, self.templates['CibAddXml'] % (constraint_xml))
-
-    def RemoveDummyRsc(self, node, rid):
-        constraint = "\"//rsc_location[@rsc='%s']\"" % (rid)
-        rsc = "\"//primitive[@id='%s']\"" % (rid)
-
-        self.rsh(node, self.templates['CibDelXpath'] % constraint)
-        self.rsh(node, self.templates['CibDelXpath'] % rsc)