Diffstat (limited to 'cts/lab/CTSaudits.py')
-rwxr-xr-x  cts/lab/CTSaudits.py  879
1 file changed, 879 insertions(+), 0 deletions(-)
diff --git a/cts/lab/CTSaudits.py b/cts/lab/CTSaudits.py
new file mode 100755
index 0000000..51a04f8
--- /dev/null
+++ b/cts/lab/CTSaudits.py
@@ -0,0 +1,879 @@
+""" Auditing classes for Pacemaker's Cluster Test Suite (CTS)
+"""
+
+__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
+__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
+
+import time, re, uuid
+
+from pacemaker.buildoptions import BuildOptions
+from pacemaker._cts.watcher import LogKind, LogWatcher
+
+class ClusterAudit(object):
+
+ def __init__(self, cm):
+ self.CM = cm
+
+ def __call__(self):
+ raise ValueError("Abstract Class member (__call__)")
+
+    def is_applicable(self):
+        '''Return TRUE if we are applicable in the current test configuration'''
+        raise ValueError("Abstract Class member (is_applicable)")
+
+ def log(self, args):
+ self.CM.log("audit: %s" % args)
+
+ def debug(self, args):
+ self.CM.debug("audit: %s" % args)
+
+ def name(self):
+ raise ValueError("Abstract Class member (name)")
+
+AllAuditClasses = [ ]
+
+
+class LogAudit(ClusterAudit):
+
+ def name(self):
+ return "LogAudit"
+
+ def __init__(self, cm):
+ self.CM = cm
+
+ def RestartClusterLogging(self, nodes=None):
+ if not nodes:
+ nodes = self.CM.Env["nodes"]
+
+ self.CM.debug("Restarting logging on: %s" % repr(nodes))
+
+ for node in nodes:
+ if self.CM.Env["have_systemd"]:
+ (rc, _) = self.CM.rsh(node, "systemctl stop systemd-journald.socket")
+ if rc != 0:
+ self.CM.log ("ERROR: Cannot stop 'systemd-journald' on %s" % node)
+
+ (rc, _) = self.CM.rsh(node, "systemctl start systemd-journald.service")
+ if rc != 0:
+ self.CM.log ("ERROR: Cannot start 'systemd-journald' on %s" % node)
+
+ (rc, _) = self.CM.rsh(node, "service %s restart" % self.CM.Env["syslogd"])
+ if rc != 0:
+ self.CM.log ("ERROR: Cannot restart '%s' on %s" % (self.CM.Env["syslogd"], node))
+
+ def _create_watcher(self, patterns, kind):
+ watch = LogWatcher(self.CM.Env["LogFileName"], patterns,
+ self.CM.Env["nodes"], kind, "LogAudit", 5,
+ silent=True)
+ watch.set_watch()
+ return watch
+
+ def TestLogging(self):
+ patterns = []
+ prefix = "Test message from"
+ suffix = str(uuid.uuid4())
+ watch = {}
+
+ for node in self.CM.Env["nodes"]:
+ # Look for the node name in two places to make sure
+ # that syslog is logging with the correct hostname
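+            # (For example, a fully qualified name like "pcmk-1.example.com"
+            # would be reduced to "pcmk-1" below; purely illustrative.)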
+ m = re.search("^([^.]+).*", node)
+ if m:
+ simple = m.group(1)
+ else:
+ simple = node
+ patterns.append("%s.*%s %s %s" % (simple, prefix, node, suffix))
+
+ watch_pref = self.CM.Env["LogWatcher"]
+ if watch_pref == LogKind.ANY:
+ kinds = [ LogKind.FILE ]
+ if self.CM.Env["have_systemd"]:
+ kinds += [ LogKind.JOURNAL ]
+ kinds += [ LogKind.REMOTE_FILE ]
+ for k in kinds:
+ watch[k] = self._create_watcher(patterns, k)
+ self.CM.log("Logging test message with identifier %s" % (suffix))
+ else:
+ watch[watch_pref] = self._create_watcher(patterns, watch_pref)
+
+ for node in self.CM.Env["nodes"]:
+ cmd = "logger -p %s.info %s %s %s" % (self.CM.Env["SyslogFacility"], prefix, node, suffix)
+
+ (rc, _) = self.CM.rsh(node, cmd, synchronous=False, verbose=0)
+ if rc != 0:
+ self.CM.log ("ERROR: Cannot execute remote command [%s] on %s" % (cmd, node))
+
+ for k in list(watch.keys()):
+ w = watch[k]
+ if watch_pref == LogKind.ANY:
+ self.CM.log("Checking for test message in %s logs" % (k))
+ w.look_for_all(silent=True)
+ if w.unmatched:
+ for regex in w.unmatched:
+ self.CM.log("Test message [%s] not found in %s logs" % (regex, w.kind))
+ else:
+ if watch_pref == LogKind.ANY:
+ self.CM.log("Found test message in %s logs" % (k))
+ self.CM.Env["LogWatcher"] = k
+ return 1
+
+ return 0
+
+ def __call__(self):
+        max_attempts = 3
+ attempt = 0
+
+ self.CM.ns.wait_for_all_nodes(self.CM.Env["nodes"])
+        while attempt <= max_attempts and self.TestLogging() == 0:
+ attempt = attempt + 1
+ self.RestartClusterLogging()
+ time.sleep(60*attempt)
+
+        if attempt > max_attempts:
+ self.CM.log("ERROR: Cluster logging unrecoverable.")
+ return 0
+
+ return 1
+
+ def is_applicable(self):
+ if self.CM.Env["DoBSC"]:
+ return 0
+ if self.CM.Env["LogAuditDisabled"]:
+ return 0
+ return 1
+
+
+class DiskAudit(ClusterAudit):
+
+ def name(self):
+ return "DiskspaceAudit"
+
+ def __init__(self, cm):
+ self.CM = cm
+
+ def __call__(self):
+ result = 1
+ # @TODO Use directory of PCMK_logfile if set on host
+ dfcmd = "df -BM " + BuildOptions.LOG_DIR + " | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%'"
+
+ self.CM.ns.wait_for_all_nodes(self.CM.Env["nodes"])
+ for node in self.CM.Env["nodes"]:
+ (_, dfout) = self.CM.rsh(node, dfcmd, verbose=1)
+ if not dfout:
+ self.CM.log ("ERROR: Cannot execute remote df command [%s] on %s" % (dfcmd, node))
+ else:
+ dfout = dfout[0].strip()
+
+ try:
+ (used, remain) = dfout.split()
+ used_percent = int(used)
+ remaining_mb = int(remain)
+ except (ValueError, TypeError):
+ self.CM.log("Warning: df output '%s' from %s was invalid [%s, %s]"
+ % (dfout, node, used, remain))
+ else:
+ if remaining_mb < 10 or used_percent > 95:
+ self.CM.log("CRIT: Out of log disk space on %s (%d%% / %dMB)"
+ % (node, used_percent, remaining_mb))
+ result = None
+ if self.CM.Env["continue"]:
+ answer = "Y"
+ else:
+ try:
+ answer = input('Continue? [nY]')
+                    except EOFError:
+ answer = "n"
+
+ if answer and answer == "n":
+ raise ValueError("Disk full on %s" % (node))
+
+ elif remaining_mb < 100 or used_percent > 90:
+ self.CM.log("WARN: Low on log disk space (%dMB) on %s" % (remaining_mb, node))
+ return result
+
+ def is_applicable(self):
+ if self.CM.Env["DoBSC"]:
+ return 0
+ return 1
+
+
+class FileAudit(ClusterAudit):
+
+ def name(self):
+ return "FileAudit"
+
+ def __init__(self, cm):
+ self.CM = cm
+ self.known = []
+
+ def __call__(self):
+ result = 1
+
+ self.CM.ns.wait_for_all_nodes(self.CM.Env["nodes"])
+ for node in self.CM.Env["nodes"]:
+
+ (_, lsout) = self.CM.rsh(node, "ls -al /var/lib/pacemaker/cores/* | grep core.[0-9]", verbose=1)
+ for line in lsout:
+ line = line.strip()
+ if line not in self.known:
+ result = 0
+ self.known.append(line)
+ self.CM.log("Warning: Pacemaker core file on %s: %s" % (node, line))
+
+ (_, lsout) = self.CM.rsh(node, "ls -al /var/lib/corosync | grep core.[0-9]", verbose=1)
+ for line in lsout:
+ line = line.strip()
+ if line not in self.known:
+ result = 0
+ self.known.append(line)
+ self.CM.log("Warning: Corosync core file on %s: %s" % (node, line))
+
+ if node in self.CM.ShouldBeStatus and self.CM.ShouldBeStatus[node] == "down":
+ clean = 0
+ (_, lsout) = self.CM.rsh(node, "ls -al /dev/shm | grep qb-", verbose=1)
+ for line in lsout:
+ result = 0
+ clean = 1
+ self.CM.log("Warning: Stale IPC file on %s: %s" % (node, line))
+
+ if clean:
+ (_, lsout) = self.CM.rsh(node, "ps axf | grep -e pacemaker -e corosync", verbose=1)
+ for line in lsout:
+ self.CM.debug("ps[%s]: %s" % (node, line))
+
+ self.CM.rsh(node, "rm -rf /dev/shm/qb-*")
+
+ else:
+ self.CM.debug("Skipping %s" % node)
+
+ return result
+
+ def is_applicable(self):
+ return 1
+
+
+class AuditResource(object):
+ def __init__(self, cm, line):
+ fields = line.split()
+ self.CM = cm
+ self.line = line
+ self.type = fields[1]
+ self.id = fields[2]
+ self.clone_id = fields[3]
+ self.parent = fields[4]
+ self.rprovider = fields[5]
+ self.rclass = fields[6]
+ self.rtype = fields[7]
+ self.host = fields[8]
+ self.needs_quorum = fields[9]
+ self.flags = int(fields[10])
+ self.flags_s = fields[11]
+
+ if self.parent == "NA":
+ self.parent = None
+
+ def unique(self):
+ if self.flags & int("0x00000020", 16):
+ return 1
+ return 0
+
+ def orphan(self):
+ if self.flags & int("0x00000001", 16):
+ return 1
+ return 0
+
+ def managed(self):
+ if self.flags & int("0x00000002", 16):
+ return 1
+ return 0
+
+
+class AuditConstraint(object):
+ def __init__(self, cm, line):
+ fields = line.split()
+ self.CM = cm
+ self.line = line
+ self.type = fields[1]
+ self.id = fields[2]
+ self.rsc = fields[3]
+ self.target = fields[4]
+ self.score = fields[5]
+ self.rsc_role = fields[6]
+ self.target_role = fields[7]
+
+ if self.rsc_role == "NA":
+ self.rsc_role = None
+ if self.target_role == "NA":
+ self.target_role = None
+
+
+class PrimitiveAudit(ClusterAudit):
+ def name(self):
+ return "PrimitiveAudit"
+
+ def __init__(self, cm):
+ self.CM = cm
+
+ def doResourceAudit(self, resource, quorum):
+ rc = 1
+ active = self.CM.ResourceLocation(resource.id)
+
+ if len(active) == 1:
+ if quorum:
+ self.debug("Resource %s active on %s" % (resource.id, repr(active)))
+
+ elif resource.needs_quorum == 1:
+ self.CM.log("Resource %s active without quorum: %s"
+ % (resource.id, repr(active)))
+ rc = 0
+
+ elif not resource.managed():
+ self.CM.log("Resource %s not managed. Active on %s"
+ % (resource.id, repr(active)))
+
+ elif not resource.unique():
+ # TODO: Figure out a clever way to actually audit these resource types
+ if len(active) > 1:
+ self.debug("Non-unique resource %s is active on: %s"
+ % (resource.id, repr(active)))
+ else:
+ self.debug("Non-unique resource %s is not active" % resource.id)
+
+ elif len(active) > 1:
+ self.CM.log("Resource %s is active multiple times: %s"
+ % (resource.id, repr(active)))
+ rc = 0
+
+ elif resource.orphan():
+ self.debug("Resource %s is an inactive orphan" % resource.id)
+
+ elif len(self.inactive_nodes) == 0:
+ self.CM.log("WARN: Resource %s not served anywhere" % resource.id)
+ rc = 0
+
+ elif self.CM.Env["warn-inactive"]:
+ if quorum or not resource.needs_quorum:
+ self.CM.log("WARN: Resource %s not served anywhere (Inactive nodes: %s)"
+ % (resource.id, repr(self.inactive_nodes)))
+ else:
+ self.debug("Resource %s not served anywhere (Inactive nodes: %s)"
+ % (resource.id, repr(self.inactive_nodes)))
+
+ elif quorum or not resource.needs_quorum:
+ self.debug("Resource %s not served anywhere (Inactive nodes: %s)"
+ % (resource.id, repr(self.inactive_nodes)))
+
+ return rc
+
+ def setup(self):
+ self.target = None
+ self.resources = []
+ self.constraints = []
+ self.active_nodes = []
+ self.inactive_nodes = []
+
+ for node in self.CM.Env["nodes"]:
+ if self.CM.ShouldBeStatus[node] == "up":
+ self.active_nodes.append(node)
+ else:
+ self.inactive_nodes.append(node)
+
+ for node in self.CM.Env["nodes"]:
+ if self.target == None and self.CM.ShouldBeStatus[node] == "up":
+ self.target = node
+
+ if not self.target:
+ # TODO: In Pacemaker 1.0 clusters we'll be able to run crm_resource
+ # with CIB_file=/path/to/cib.xml even when the cluster isn't running
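+        # (Illustratively, such an invocation might look like
+        # "CIB_file=/path/to/cib.xml crm_resource -c"; whether crm_resource
+        # honors CIB_file depends on the release, so this is an assumption.)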
+ self.debug("No nodes active - skipping %s" % self.name())
+ return 0
+
+ (_, lines) = self.CM.rsh(self.target, "crm_resource -c", verbose=1)
+
+ for line in lines:
+ if re.search("^Resource", line):
+ self.resources.append(AuditResource(self.CM, line))
+ elif re.search("^Constraint", line):
+ self.constraints.append(AuditConstraint(self.CM, line))
+ else:
+ self.CM.log("Unknown entry: %s" % line);
+
+ return 1
+
+ def __call__(self):
+ rc = 1
+
+ if not self.setup():
+ return 1
+
+ quorum = self.CM.HasQuorum(None)
+ for resource in self.resources:
+ if resource.type == "primitive":
+ if self.doResourceAudit(resource, quorum) == 0:
+ rc = 0
+ return rc
+
+ def is_applicable(self):
+ # @TODO Due to long-ago refactoring, this name test would never match,
+ # so this audit (and those derived from it) would never run.
+ # Uncommenting the next lines fixes the name test, but that then
+ # exposes pre-existing bugs that need to be fixed.
+ #if self.CM["Name"] == "crm-corosync":
+ # return 1
+ return 0
+
+
+class GroupAudit(PrimitiveAudit):
+ def name(self):
+ return "GroupAudit"
+
+ def __call__(self):
+ rc = 1
+ if not self.setup():
+ return 1
+
+ for group in self.resources:
+ if group.type == "group":
+ first_match = 1
+ group_location = None
+ for child in self.resources:
+ if child.parent == group.id:
+ nodes = self.CM.ResourceLocation(child.id)
+
+ if first_match and len(nodes) > 0:
+ group_location = nodes[0]
+
+ first_match = 0
+
+ if len(nodes) > 1:
+ rc = 0
+ self.CM.log("Child %s of %s is active more than once: %s"
+ % (child.id, group.id, repr(nodes)))
+
+ elif len(nodes) == 0:
+ # Groups are allowed to be partially active
+ # However we do need to make sure later children aren't running
+ group_location = None
+ self.debug("Child %s of %s is stopped" % (child.id, group.id))
+
+ elif nodes[0] != group_location:
+ rc = 0
+ self.CM.log("Child %s of %s is active on the wrong node (%s) expected %s"
+ % (child.id, group.id, nodes[0], group_location))
+ else:
+ self.debug("Child %s of %s is active on %s" % (child.id, group.id, nodes[0]))
+
+ return rc
+
+
+class CloneAudit(PrimitiveAudit):
+ def name(self):
+ return "CloneAudit"
+
+ def __call__(self):
+ rc = 1
+ if not self.setup():
+ return 1
+
+ for clone in self.resources:
+ if clone.type == "clone":
+ for child in self.resources:
+ if child.parent == clone.id and child.type == "primitive":
+ self.debug("Checking child %s of %s..." % (child.id, clone.id))
+ # Check max and node_max
+                        # Obtain with:
+                        # crm_resource -g clone_max --meta -r child.id
+                        # crm_resource -g clone_node_max --meta -r child.id
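+                        # Hedged sketch (an addition, not part of the original
+                        # audit): query the meta-attributes named above and log
+                        # them for manual inspection.  The attribute names are
+                        # taken from the comment and may vary by release.
+                        for attr in ["clone_max", "clone_node_max"]:
+                            (rc_attr, out) = self.CM.rsh(
+                                self.target,
+                                "crm_resource -g %s --meta -r %s" % (attr, child.id),
+                                verbose=1)
+                            if rc_attr == 0 and out:
+                                self.debug("%s %s: %s" % (child.id, attr, out[0].strip()))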
+
+ return rc
+
+
+class ColocationAudit(PrimitiveAudit):
+ def name(self):
+ return "ColocationAudit"
+
+ def crm_location(self, resource):
+        (rc, lines) = self.CM.rsh(self.target, "crm_resource -W -r %s -Q" % resource, verbose=1)
+ hosts = []
+ if rc == 0:
+ for line in lines:
+ fields = line.split()
+ hosts.append(fields[0])
+
+ return hosts
+
+ def __call__(self):
+ rc = 1
+ if not self.setup():
+ return 1
+
+ for coloc in self.constraints:
+ if coloc.type == "rsc_colocation":
+ source = self.crm_location(coloc.rsc)
+ target = self.crm_location(coloc.target)
+ if len(source) == 0:
+ self.debug("Colocation audit (%s): %s not running" % (coloc.id, coloc.rsc))
+ else:
+ for node in source:
+ if not node in target:
+ rc = 0
+ self.CM.log("Colocation audit (%s): %s running on %s (not in %s)"
+ % (coloc.id, coloc.rsc, node, repr(target)))
+ else:
+ self.debug("Colocation audit (%s): %s running on %s (in %s)"
+ % (coloc.id, coloc.rsc, node, repr(target)))
+
+ return rc
+
+
+class ControllerStateAudit(ClusterAudit):
+ def __init__(self, cm):
+ self.CM = cm
+ self.Stats = {"calls":0
+ , "success":0
+ , "failure":0
+ , "skipped":0
+ , "auditfail":0}
+
+ def has_key(self, key):
+ return key in self.Stats
+
+ def __setitem__(self, key, value):
+ self.Stats[key] = value
+
+ def __getitem__(self, key):
+ return self.Stats[key]
+
+ def incr(self, name):
+ '''Increment (or initialize) the value associated with the given name'''
+ if not name in self.Stats:
+ self.Stats[name] = 0
+ self.Stats[name] = self.Stats[name]+1
+
+ def __call__(self):
+ passed = 1
+ up_are_down = 0
+ down_are_up = 0
+ unstable_list = []
+
+ for node in self.CM.Env["nodes"]:
+ should_be = self.CM.ShouldBeStatus[node]
+ rc = self.CM.test_node_CM(node)
+ if rc > 0:
+ if should_be == "down":
+ down_are_up = down_are_up + 1
+ if rc == 1:
+ unstable_list.append(node)
+ elif should_be == "up":
+ up_are_down = up_are_down + 1
+
+ if len(unstable_list) > 0:
+ passed = 0
+ self.CM.log("Cluster is not stable: %d (of %d): %s"
+ % (len(unstable_list), self.CM.upcount(), repr(unstable_list)))
+
+ if up_are_down > 0:
+ passed = 0
+ self.CM.log("%d (of %d) nodes expected to be up were down."
+ % (up_are_down, len(self.CM.Env["nodes"])))
+
+ if down_are_up > 0:
+ passed = 0
+ self.CM.log("%d (of %d) nodes expected to be down were up."
+ % (down_are_up, len(self.CM.Env["nodes"])))
+
+ return passed
+
+ def name(self):
+ return "ControllerStateAudit"
+
+ def is_applicable(self):
+ # @TODO Due to long-ago refactoring, this name test would never match,
+ # so this audit (and those derived from it) would never run.
+ # Uncommenting the next lines fixes the name test, but that then
+ # exposes pre-existing bugs that need to be fixed.
+ #if self.CM["Name"] == "crm-corosync":
+ # return 1
+ return 0
+
+
+class CIBAudit(ClusterAudit):
+ def __init__(self, cm):
+ self.CM = cm
+ self.Stats = {"calls":0
+ , "success":0
+ , "failure":0
+ , "skipped":0
+ , "auditfail":0}
+
+ def has_key(self, key):
+ return key in self.Stats
+
+ def __setitem__(self, key, value):
+ self.Stats[key] = value
+
+ def __getitem__(self, key):
+ return self.Stats[key]
+
+ def incr(self, name):
+ '''Increment (or initialize) the value associated with the given name'''
+ if not name in self.Stats:
+ self.Stats[name] = 0
+ self.Stats[name] = self.Stats[name]+1
+
+ def __call__(self):
+ passed = 1
+ ccm_partitions = self.CM.find_partitions()
+
+ if len(ccm_partitions) == 0:
+ self.debug("\tNo partitions to audit")
+ return 1
+
+ for partition in ccm_partitions:
+ self.debug("\tAuditing CIB consistency for: %s" % partition)
+ partition_passed = 0
+ if self.audit_cib_contents(partition) == 0:
+ passed = 0
+
+ return passed
+
+ def audit_cib_contents(self, hostlist):
+ passed = 1
+ node0 = None
+ node0_xml = None
+
+ partition_hosts = hostlist.split()
+ for node in partition_hosts:
+ node_xml = self.store_remote_cib(node, node0)
+
+ if node_xml == None:
+ self.CM.log("Could not perform audit: No configuration from %s" % node)
+ passed = 0
+
+ elif node0 == None:
+ node0 = node
+ node0_xml = node_xml
+
+ elif node0_xml == None:
+ self.CM.log("Could not perform audit: No configuration from %s" % node0)
+ passed = 0
+
+ else:
+ (rc, result) = self.CM.rsh(
+ node0, "crm_diff -VV -cf --new %s --original %s" % (node_xml, node0_xml), verbose=1)
+
+ if rc != 0:
+ self.CM.log("Diff between %s and %s failed: %d" % (node0_xml, node_xml, rc))
+ passed = 0
+
+ for line in result:
+ if not re.search("<diff/>", line):
+ passed = 0
+ self.debug("CibDiff[%s-%s]: %s" % (node0, node, line))
+ else:
+ self.debug("CibDiff[%s-%s] Ignoring: %s" % (node0, node, line))
+
+# self.CM.rsh(node0, "rm -f %s" % node_xml)
+# self.CM.rsh(node0, "rm -f %s" % node0_xml)
+ return passed
+
+ def store_remote_cib(self, node, target):
+ combined = ""
+ filename = "/tmp/ctsaudit.%s.xml" % node
+
+ if not target:
+ target = node
+
+ (rc, lines) = self.CM.rsh(node, self.CM["CibQuery"], verbose=1)
+ if rc != 0:
+ self.CM.log("Could not retrieve configuration")
+ return None
+
+ self.CM.rsh("localhost", "rm -f %s" % filename)
+ for line in lines:
+ self.CM.rsh("localhost", "echo \'%s\' >> %s" % (line[:-1], filename), verbose=0)
+
+ if self.CM.rsh.copy(filename, "root@%s:%s" % (target, filename), silent=True) != 0:
+ self.CM.log("Could not store configuration")
+ return None
+ return filename
+
+ def name(self):
+ return "CibAudit"
+
+ def is_applicable(self):
+ # @TODO Due to long-ago refactoring, this name test would never match,
+ # so this audit (and those derived from it) would never run.
+ # Uncommenting the next lines fixes the name test, but that then
+ # exposes pre-existing bugs that need to be fixed.
+ #if self.CM["Name"] == "crm-corosync":
+ # return 1
+ return 0
+
+
+class PartitionAudit(ClusterAudit):
+ def __init__(self, cm):
+ self.CM = cm
+ self.Stats = {"calls":0
+ , "success":0
+ , "failure":0
+ , "skipped":0
+ , "auditfail":0}
+ self.NodeEpoch = {}
+ self.NodeState = {}
+ self.NodeQuorum = {}
+
+ def has_key(self, key):
+ return key in self.Stats
+
+ def __setitem__(self, key, value):
+ self.Stats[key] = value
+
+ def __getitem__(self, key):
+ return self.Stats[key]
+
+ def incr(self, name):
+ '''Increment (or initialize) the value associated with the given name'''
+ if not name in self.Stats:
+ self.Stats[name] = 0
+ self.Stats[name] = self.Stats[name]+1
+
+ def __call__(self):
+ passed = 1
+ ccm_partitions = self.CM.find_partitions()
+
+ if ccm_partitions == None or len(ccm_partitions) == 0:
+ return 1
+
+ self.CM.cluster_stable(double_check=True)
+
+ if len(ccm_partitions) != self.CM.partitions_expected:
+ self.CM.log("ERROR: %d cluster partitions detected:" % len(ccm_partitions))
+ passed = 0
+ for partition in ccm_partitions:
+ self.CM.log("\t %s" % partition)
+
+ for partition in ccm_partitions:
+ partition_passed = 0
+ if self.audit_partition(partition) == 0:
+ passed = 0
+
+ return passed
+
+ def trim_string(self, avalue):
+ if not avalue:
+ return None
+ if len(avalue) > 1:
+ return avalue[:-1]
+
+ def trim2int(self, avalue):
+ if not avalue:
+ return None
+ if len(avalue) > 1:
+ return int(avalue[:-1])
+
+ def audit_partition(self, partition):
+ passed = 1
+ dc_found = []
+ dc_allowed_list = []
+ lowest_epoch = None
+ node_list = partition.split()
+
+ self.debug("Auditing partition: %s" % (partition))
+ for node in node_list:
+ if self.CM.ShouldBeStatus[node] != "up":
+ self.CM.log("Warn: Node %s appeared out of nowhere" % (node))
+ self.CM.ShouldBeStatus[node] = "up"
+ # not in itself a reason to fail the audit (not what we're
+ # checking for in this audit)
+
+ (_, out) = self.CM.rsh(node, self.CM["StatusCmd"] % node, verbose=1)
+ self.NodeState[node] = out[0].strip()
+
+ (_, out) = self.CM.rsh(node, self.CM["EpochCmd"], verbose=1)
+ self.NodeEpoch[node] = out[0].strip()
+
+ (_, out) = self.CM.rsh(node, self.CM["QuorumCmd"], verbose=1)
+ self.NodeQuorum[node] = out[0].strip()
+
+ self.debug("Node %s: %s - %s - %s." % (node, self.NodeState[node], self.NodeEpoch[node], self.NodeQuorum[node]))
+ self.NodeState[node] = self.trim_string(self.NodeState[node])
+ self.NodeEpoch[node] = self.trim2int(self.NodeEpoch[node])
+ self.NodeQuorum[node] = self.trim_string(self.NodeQuorum[node])
+
+ if not self.NodeEpoch[node]:
+ self.CM.log("Warn: Node %s dissappeared: cant determin epoch" % (node))
+ self.CM.ShouldBeStatus[node] = "down"
+ # not in itself a reason to fail the audit (not what we're
+ # checking for in this audit)
+ elif lowest_epoch == None or self.NodeEpoch[node] < lowest_epoch:
+ lowest_epoch = self.NodeEpoch[node]
+
+ if not lowest_epoch:
+ self.CM.log("Lowest epoch not determined in %s" % (partition))
+ passed = 0
+
+ for node in node_list:
+ if self.CM.ShouldBeStatus[node] == "up":
+ if self.CM.is_node_dc(node, self.NodeState[node]):
+ dc_found.append(node)
+ if self.NodeEpoch[node] == lowest_epoch:
+ self.debug("%s: OK" % node)
+ elif not self.NodeEpoch[node]:
+ self.debug("Check on %s ignored: no node epoch" % node)
+ elif not lowest_epoch:
+ self.debug("Check on %s ignored: no lowest epoch" % node)
+ else:
+ self.CM.log("DC %s is not the oldest node (%d vs. %d)"
+ % (node, self.NodeEpoch[node], lowest_epoch))
+ passed = 0
+
+ if len(dc_found) == 0:
+ self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)"
+ % (len(dc_allowed_list), str(dc_allowed_list), str(node_list)))
+
+ elif len(dc_found) > 1:
+ self.CM.log("%d DCs (%s) found in cluster partition: %s"
+ % (len(dc_found), str(dc_found), str(node_list)))
+ passed = 0
+
+ if passed == 0:
+ for node in node_list:
+ if self.CM.ShouldBeStatus[node] == "up":
+ self.CM.log("epoch %s : %s"
+ % (self.NodeEpoch[node], self.NodeState[node]))
+
+ return passed
+
+ def name(self):
+ return "PartitionAudit"
+
+ def is_applicable(self):
+ # @TODO Due to long-ago refactoring, this name test would never match,
+ # so this audit (and those derived from it) would never run.
+ # Uncommenting the next lines fixes the name test, but that then
+ # exposes pre-existing bugs that need to be fixed.
+ #if self.CM["Name"] == "crm-corosync":
+ # return 1
+ return 0
+
+AllAuditClasses.append(DiskAudit)
+AllAuditClasses.append(FileAudit)
+AllAuditClasses.append(LogAudit)
+AllAuditClasses.append(ControllerStateAudit)
+AllAuditClasses.append(PartitionAudit)
+AllAuditClasses.append(PrimitiveAudit)
+AllAuditClasses.append(GroupAudit)
+AllAuditClasses.append(CloneAudit)
+AllAuditClasses.append(ColocationAudit)
+AllAuditClasses.append(CIBAudit)
+
+
+def AuditList(cm):
+ result = []
+ for auditclass in AllAuditClasses:
+ a = auditclass(cm)
+ if a.is_applicable():
+ result.append(a)
+ return result
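+
+# Typical usage (illustrative; "cm" is assumed to be the cluster-manager object
+# that the rest of CTS already constructs and passes in):
+#
+#     audits = AuditList(cm)
+#     for audit in audits:
+#         if not audit():
+#             cm.log("Audit %s failed" % audit.name())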