summaryrefslogtreecommitdiffstats
path: root/scripts/health
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/health')
-rwxr-xr-xscripts/health/collect.py111
-rwxr-xr-xscripts/health/hahealth.py40
-rw-r--r--scripts/health/main.yml16
-rwxr-xr-xscripts/health/report.py134
4 files changed, 301 insertions, 0 deletions
diff --git a/scripts/health/collect.py b/scripts/health/collect.py
new file mode 100755
index 0000000..180b866
--- /dev/null
+++ b/scripts/health/collect.py
@@ -0,0 +1,111 @@
+#!/usr/bin/python3
+from __future__ import unicode_literals
+from builtins import str
+import os
+import pwd
+import hashlib
+import platform
+import crm_script
+
+import crmsh.log
+crmsh.log.setup_logging()
+from crmsh.report import utils
+
+# Input handed to this script by crm_script.
+# NOTE(review): the name `data` is rebound to the collected results in the
+# try block at the bottom of this script.
+data = crm_script.get_input()
+
+# Package names passed to crm_script.rpmcheck() by rpm_info().
+PACKAGES = ['booth', 'cluster-glue', 'corosync', 'crmsh', 'csync2', 'drbd',
+ 'fence-agents', 'gfs2', 'gfs2-utils', 'ha-cluster-bootstrap',
+ 'haproxy', 'hawk', 'libdlm', 'libqb', 'ocfs2', 'ocfs2-tools',
+ 'pacemaker', 'pacemaker-mgmt', 'resource-agents', 'sbd']
+
+
+def rpm_info():
+    """Return whatever crm_script.rpmcheck() reports for the names in PACKAGES."""
+    package_state = crm_script.rpmcheck(PACKAGES)
+    return package_state
+
+
+def logrotate_info():
+    """Placeholder: no log rotation data is collected, so the result is always empty."""
+    return dict()
+
+
+def get_user():
+    """Return the login name that the password database lists for our real uid."""
+    uid = os.getuid()
+    entry = pwd.getpwuid(uid)
+    return entry.pw_name
+
+
+def sys_info():
+    """Collect basic facts about the local host.
+
+    Returns a dict with the six fields of platform.uname() ('system',
+    'node', 'release', 'version', 'machine', 'processor'), the value of
+    utils.get_distro_info() as 'distname', the current user, the host
+    name from os.uname(), the first two fields of /proc/uptime as
+    'uptime' and 'idletime', and the third field of /proc/loadavg as
+    'loadavg'. All /proc values are returned as strings.
+    """
+    system, node, release, version, machine, processor = platform.uname()
+    distname = utils.get_distro_info()
+    hostname = os.uname()[1]
+
+    # Use context managers so the /proc handles are closed right away.
+    with open('/proc/uptime') as uptime_file:
+        uptime = uptime_file.read().split()
+
+    # The first three columns measure CPU and IO utilization of the
+    # last one, five, and 15 minute periods. The fourth column shows
+    # the number of currently running processes and the total number of
+    # processes. The last column displays the last process ID used.
+    with open('/proc/loadavg') as loadavg_file:
+        loadavg = loadavg_file.read().split()
+
+    return {'system': system,
+            'node': node,
+            'release': release,
+            'version': version,
+            'machine': machine,
+            'processor': processor,
+            'distname': distname,
+            'user': get_user(),
+            'hostname': hostname,
+            'uptime': uptime[0],
+            'idletime': uptime[1],
+            'loadavg': loadavg[2]  # 15 minute average
+            }
+
+
+def disk_info():
+    """Run ``df`` and return (sixth field, fifth field) tuples from its output.
+
+    The header line is skipped, as are blank lines and lines with fewer
+    than six whitespace-separated fields. An empty list is returned when
+    ``df`` exits with a non-zero status.
+
+    NOTE(review): the report step consumes each tuple as (disk, use) and
+    strips the last character of ``use`` -- presumably the mount point
+    and the "Use%" column; confirm against the local df output format.
+    """
+    rc, out, err = crm_script.call(['df'], shell=False)
+    if rc != 0:
+        return []
+    rows = (entry.strip() for entry in out.split('\n')[1:])
+    split_rows = (row.split() for row in rows if row)
+    return [(fields[5], fields[4]) for fields in split_rows if len(fields) >= 6]
+
+
+# Configuration files that files_info() checksums on this node, so the
+# report step can detect configurations that are out of sync between hosts.
+
+FILES = [
+ '/etc/csync2/key_hagroup',
+ '/etc/csync2/csync2.cfg',
+ '/etc/corosync/corosync.conf',
+ '/etc/sysconfig/sbd',
+ '/etc/sysconfig/SuSEfirewall2',
+ '/etc/sysconfig/SuSEfirewall2.d/services/cluster'
+ ]
+
+
+def files_info():
+    """Return a dict mapping every path in FILES to a checksum string.
+
+    The value is the SHA-1 hex digest of the file content, an empty
+    string when the path is not a regular file, or "error: <reason>"
+    when the file could not be read.
+    """
+    ret = {}
+    for f in FILES:
+        if not os.path.isfile(f):
+            ret[f] = ""
+            continue
+        try:
+            # Hash the raw bytes: decoding as text first would raise
+            # UnicodeDecodeError (not an IOError) on non-UTF-8 content
+            # and abort the whole collection. The with-block also makes
+            # sure the handle is closed.
+            with open(f, 'rb') as fh:
+                ret[f] = hashlib.sha1(fh.read()).hexdigest()
+        except IOError as e:
+            ret[f] = "error: %s" % (e)
+    return ret
+
+
+# Gather every section and hand the combined result to crm_script.exit_ok().
+# Any exception raised while collecting is reported through
+# crm_script.exit_fail() as its string form.
+try:
+ # Keys here are read back by the report step ('system', 'rpm', 'disk',
+ # 'logrotate', 'files').
+ data = {
+ 'rpm': rpm_info(),
+ 'logrotate': logrotate_info(),
+ 'system': sys_info(),
+ 'disk': disk_info(),
+ 'files': files_info()
+ }
+ crm_script.exit_ok(data)
+except Exception as e:
+ crm_script.exit_fail(str(e))
diff --git a/scripts/health/hahealth.py b/scripts/health/hahealth.py
new file mode 100755
index 0000000..f46aec6
--- /dev/null
+++ b/scripts/health/hahealth.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python3
+import os
+import crm_script as crm
+
+
+# Without a crm binary in either location there is nothing to check.
+# NOTE(review): presumably crm.exit_ok() terminates the script here, so
+# the code below only runs when crm is installed -- confirm.
+if not os.path.isfile('/usr/sbin/crm') and not os.path.isfile('/usr/bin/crm'):
+ # crm not installed
+ crm.exit_ok({'status': 'crm not installed'})
+
+
+def get_from_date():
+    """Return the local time one day ago, formatted as ``YYYY-MM-DD HH:MM``.
+
+    Computed in-process instead of shelling out to ``date``: no shell is
+    spawned, there is no dependency on GNU date's ``--date`` option, and
+    the result can never be the empty string a failed command would
+    have produced.
+    """
+    from datetime import datetime, timedelta
+    return (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d %H:%M')
+
+
+def create_report():
+    """Run ``crm report`` starting at get_from_date(); True when it exits with 0."""
+    since = get_from_date()
+    argv = ['crm', 'report', '-f', since, '-D', '-Z', 'health-report']
+    rc, _out, _err = crm.call(argv, shell=False)
+    return rc == 0
+
+
+# Report the failure as a status value when no report could be created.
+if not create_report():
+    crm.exit_ok({'status': 'Failed to create report'})
+
+
+def extract_report():
+    """Unpack health-report.tar.bz2 with ``tar xjf``; True when tar exits with 0."""
+    argv = ['tar', 'xjf', 'health-report.tar.bz2']
+    rc, _out, _err = crm.call(argv, shell=False)
+    return rc == 0
+
+
+# Report the failure as a status value when the archive could not be unpacked.
+if not extract_report():
+    crm.exit_ok({'status': 'Failed to extract report'})
+
+# Pass the analysis text back when the extracted report contains one;
+# otherwise the 'analysis' value stays empty.
+analysis = ''
+if os.path.isfile('health-report/analysis.txt'):
+    # Read inside a with-block so the file handle is closed before exiting.
+    with open('health-report/analysis.txt') as analysis_file:
+        analysis = analysis_file.read()
+
+crm.exit_ok({'status': 'OK', 'analysis': analysis})
diff --git a/scripts/health/main.yml b/scripts/health/main.yml
new file mode 100644
index 0000000..7c59bdd
--- /dev/null
+++ b/scripts/health/main.yml
@@ -0,0 +1,16 @@
+version: 2.2
+category: Basic
+shortdesc: Verify health and configuration
+longdesc: |
+ Checks and detects issues with the cluster, by creating and
+ analysing a cluster report.
+
+ Requires SSH access between cluster nodes. This command is
+ also available from the command line as "crm cluster health".
+actions:
+ - collect: collect.py
+ shortdesc: Collect information
+ - apply_local: hahealth.py
+ shortdesc: Run cluster health check
+ - report: report.py
+ shortdesc: Report cluster state
diff --git a/scripts/health/report.py b/scripts/health/report.py
new file mode 100755
index 0000000..51e11d2
--- /dev/null
+++ b/scripts/health/report.py
@@ -0,0 +1,134 @@
+#!/usr/bin/python3
+import os
+import crm_script
+# NOTE(review): data[1] is presumably the output of the first action
+# (collect.py), keyed by node name -- the loops below read it as
+# {node: {'system': ..., 'rpm': ..., 'disk': ..., ...}}; confirm.
+data = crm_script.get_input()
+health_report = data[1]
+
+print("Processing collected information...")
+
+# Packages for which an error is raised when no node reports them at all.
+CORE_PACKAGES = ['corosync', 'pacemaker', 'resource-agents']
+
+# Messages accumulated during the analysis; printed at the end of the script.
+warnings = []
+errors = []
+
+
+def warn(fmt, *args):
+    """Record a warning message.
+
+    With extra arguments, ``fmt`` is a %-format string and ``args`` are
+    its values. Without arguments, ``fmt`` is stored verbatim, so an
+    already formatted message may safely contain a literal '%'.
+    """
+    warnings.append(fmt % args if args else fmt)
+
+
+def error(fmt, *args):
+    """Record an error message; same formatting rules as warn()."""
+    errors.append(fmt % args if args else fmt)
+
+
+# sort {package: {version: [host]}}
+rpm_versions = {}
+
+# Thresholds for the per-node warnings below: uptime is compared against
+# LOW_UPTIME and the 15 minute load average against HIGH_LOAD.
+LOW_UPTIME = 60.0
+HIGH_LOAD = 1.0
+
+# Per-node checks. Messages are passed as format string plus arguments
+# so that a '%' inside a collected value cannot break the formatting.
+for node, info in health_report.items():
+    if node != info['system']['hostname']:
+        error("Hostname mismatch: %s is not %s",
+              node, info['system']['hostname'])
+
+    if float(info['system']['uptime']) < LOW_UPTIME:
+        warn("%s: Uptime is low: %ss", node, info['system']['uptime'])
+
+    if float(info['system']['loadavg']) > HIGH_LOAD:
+        warn("%s: 15 minute load average is %s", node, info['system']['loadavg'])
+
+    for rpm in info['rpm']:
+        # Entries carrying an 'error' are grouped under the error text,
+        # all others under their version string.
+        key = rpm['error'] if 'error' in rpm else rpm['version']
+        rpm_versions.setdefault(rpm['name'], {}).setdefault(key, []).append(node)
+
+    for disk, use in info['disk']:
+        try:
+            # Drop the trailing character (presumably '%') before converting.
+            use = int(use[:-1])
+        except ValueError:
+            # The columns come from whitespace-splitting df output, so
+            # the usage field is not guaranteed to be a percentage;
+            # skip such rows instead of aborting the whole report.
+            continue
+        if use > 90:
+            warn("On %s, disk %s usage is %s%%", node, disk, use)
+
+    for logfile, state in info['logrotate'].items():
+        if not state:
+            warn("%s: No log rotation configured for %s", node, logfile)
+
+# A core package that no node reported at all is an error.
+for cp in CORE_PACKAGES:
+ if cp not in rpm_versions:
+ error("Core package '%s' not installed on any node", cp)
+
+# Per package: warn when nodes reported different versions (or errors),
+# and warn about nodes that did not report the package.
+# NOTE(review): indentation was lost in this listing; the all_hosts part
+# is presumably at loop level rather than under the `if` -- confirm.
+for name, versions in rpm_versions.items():
+ if len(versions) > 1:
+ desc = ', '.join('%s (%s)' % (v, ', '.join(nodes)) for v, nodes in list(versions.items()))
+ warn("Package %s: Versions differ! %s", name, desc)
+
+ # Flatten the per-version host lists into one set of hosts.
+ all_hosts = set(sum([hosts for hosts in list(versions.values())], []))
+ for node in list(health_report.keys()):
+ if len(all_hosts) > 0 and node not in all_hosts:
+ warn("Package '%s' not installed on host '%s'" % (name, node))
+
+
+def compare_system(systems):
+    """Warn about system properties that are not identical on all hosts.
+
+    systems: iterable of (host, system_dict) pairs. It may be a one-shot
+    iterator such as a generator; it is copied into a list first.
+
+    A property that is missing from a host's dict is treated as None
+    instead of raising KeyError.
+    """
+    # Every check iterates over the pairs twice, and there are several
+    # checks, so a generator argument has to be materialised once.
+    systems = list(systems)
+
+    def check(value, msg):
+        # Record one warning listing every host's value when they differ.
+        vals = set(system.get(value) for host, system in systems)
+        if len(vals) > 1:
+            info = ', '.join('%s: %s' % (h, system.get(value)) for h, system in systems)
+            warn("%s: %s", msg, info)
+
+    check('machine', 'Architecture differs')
+    check('release', 'Kernel release differs')
+    check('distname', 'Distribution differs')
+    check('distver', 'Distribution version differs')
+    # check('version', 'Kernel version differs')
+
+
+def compare_files(systems):
+    """Warn about files whose checksum is not identical on all hosts.
+
+    systems: iterable of (host, files_dict) pairs, where files_dict maps
+    a file name to its checksum string. It may be a one-shot iterator
+    such as a generator; it is copied into a list first.
+
+    A file that is absent from a host's dict is compared as None.
+    """
+    # The pairs are walked once to gather file names and again for every
+    # file, so a generator argument has to be materialised once.
+    systems = list(systems)
+
+    keys = set()
+    for host, files in systems:
+        keys.update(files)
+
+    # Sorted so the warnings come out in a stable order.
+    for filename in sorted(keys):
+        vals = set(files.get(filename) for host, files in systems)
+        if len(vals) > 1:
+            info = ', '.join('%s: %s' % (h, files.get(filename)) for h, files in systems)
+            # Name the file; otherwise the warning cannot be acted upon.
+            warn("Files differ: %s: %s", filename, info)
+
+
+# Cross-node comparisons; each helper receives a generator of
+# (host, section) pairs built from the collected data.
+compare_system((h, info['system']) for h, info in health_report.items())
+compare_files((h, info['files']) for h, info in health_report.items())
+
+# NOTE(review): crm_script.output(2) is presumably the result of the
+# second action (hahealth.py), a dict with 'status' and optionally
+# 'analysis' -- confirm.
+# NOTE(review): indentation was lost in this listing, so it is not
+# visible which `if` the final `else` below belongs to.
+if crm_script.output(2):
+ report = crm_script.output(2)
+ status = report.get('status')
+ analysis = report.get('analysis')
+ if status and not analysis:
+ warn("Cluster report: %s" % (status))
+ elif analysis:
+ print("INFO: Cluster report:")
+ print(analysis)
+ else:
+ warn("No cluster report generated")
+
+# Print everything that was accumulated, errors first.
+if errors:
+ for e in errors:
+ print("ERROR:", e)
+if warnings:
+ for w in warnings:
+ print("WARNING:", w)
+
+if not errors and not warnings:
+ print("No issues found.")
+
+# The directory containing the crm_script module is reported as the
+# location of the health report.
+workdir = os.path.dirname(crm_script.__file__)
+print("\nINFO: health-report in directory \"%s\"" % workdir)