From d835b2cae8abc71958b69362162e6a70c3d7ef63 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 17 Apr 2024 08:48:59 +0200
Subject: Adding upstream version 4.6.0.

Signed-off-by: Daniel Baumann
---
 scripts/health/collect.py  | 134 +++++++++++++++++++++++++++++++++++++++
 scripts/health/hahealth.py |  47 ++++++++++++++
 scripts/health/main.yml    |  16 +++++
 scripts/health/report.py   | 153 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 350 insertions(+)
 create mode 100755 scripts/health/collect.py
 create mode 100755 scripts/health/hahealth.py
 create mode 100644 scripts/health/main.yml
 create mode 100755 scripts/health/report.py

(limited to 'scripts/health')

diff --git a/scripts/health/collect.py b/scripts/health/collect.py
new file mode 100755
index 0000000..180b866
--- /dev/null
+++ b/scripts/health/collect.py
@@ -0,0 +1,134 @@
+#!/usr/bin/python3
+"""Collect per-node facts for the cluster health check.
+
+Gathers package versions, basic system information, disk usage and
+checksums of key configuration files, then passes the result to
+crm_script.exit_ok() (or the error text to crm_script.exit_fail()).
+"""
+from __future__ import unicode_literals
+from builtins import str
+import os
+import pwd
+import hashlib
+import platform
+import crm_script
+
+import crmsh.log
+crmsh.log.setup_logging()
+from crmsh.report import utils
+
+data = crm_script.get_input()
+
+PACKAGES = ['booth', 'cluster-glue', 'corosync', 'crmsh', 'csync2', 'drbd',
+            'fence-agents', 'gfs2', 'gfs2-utils', 'ha-cluster-bootstrap',
+            'haproxy', 'hawk', 'libdlm', 'libqb', 'ocfs2', 'ocfs2-tools',
+            'pacemaker', 'pacemaker-mgmt', 'resource-agents', 'sbd']
+
+
+def rpm_info():
+    """Return crm_script.rpmcheck() results for PACKAGES."""
+    return crm_script.rpmcheck(PACKAGES)
+
+
+def logrotate_info():
+    """Return log rotation state per log file (currently always empty)."""
+    return {}
+
+
+def get_user():
+    """Return the login name of the user running this script."""
+    return pwd.getpwuid(os.getuid()).pw_name
+
+
+def _read_fields(path):
+    """Return the whitespace-separated fields of the file at path."""
+    with open(path) as f:
+        return f.read().split()
+
+
+def sys_info():
+    """Return a dict describing the platform, uptime and load of this node."""
+    system, node, release, version, machine, processor = platform.uname()
+    distname = utils.get_distro_info()
+    hostname = os.uname()[1]
+
+    # /proc/uptime: seconds since boot, then seconds spent idle.
+    uptime = _read_fields('/proc/uptime')
+    # The first three columns measure CPU and IO utilization of the
+    # last one, five, and 15 minute periods. The fourth column shows
+    # the number of currently running processes and the total number of
+    # processes. The last column displays the last process ID used.
+    loadavg = _read_fields('/proc/loadavg')
+
+    return {'system': system,
+            'node': node,
+            'release': release,
+            'version': version,
+            'machine': machine,
+            'processor': processor,
+            'distname': distname,
+            'user': get_user(),
+            'hostname': hostname,
+            'uptime': uptime[0],
+            'idletime': uptime[1],
+            'loadavg': loadavg[2]  # 15 minute average
+            }
+
+
+def disk_info():
+    """Return (mount point, use%) pairs parsed from df, or [] if df fails."""
+    rc, out, err = crm_script.call(['df'], shell=False)
+    if rc != 0:
+        return []
+    disk_use = []
+    # Skip the header line; keep only rows that have all six df columns.
+    for line in out.split('\n')[1:]:
+        fields = line.split()
+        if len(fields) >= 6:
+            disk_use.append((fields[5], fields[4]))
+    return disk_use
+
+
+# configurations out of sync
+
+FILES = [
+    '/etc/csync2/key_hagroup',
+    '/etc/csync2/csync2.cfg',
+    '/etc/corosync/corosync.conf',
+    '/etc/sysconfig/sbd',
+    '/etc/sysconfig/SuSEfirewall2',
+    '/etc/sysconfig/SuSEfirewall2.d/services/cluster'
+    ]
+
+
+def files_info():
+    """Return {path: SHA-1 hex digest} for FILES.
+
+    Missing files map to "" and unreadable ones to an "error: ..." string.
+    """
+    ret = {}
+    for path in FILES:
+        if not os.path.isfile(path):
+            ret[path] = ""
+            continue
+        try:
+            # Hash the raw bytes so content that is not valid text cannot
+            # raise a decoding error.
+            with open(path, 'rb') as f:
+                ret[path] = hashlib.sha1(f.read()).hexdigest()
+        except IOError as e:
+            ret[path] = "error: %s" % (e)
+    return ret
+
+
+try:
+    data = {
+        'rpm': rpm_info(),
+        'logrotate': logrotate_info(),
+        'system': sys_info(),
+        'disk': disk_info(),
+        'files': files_info()
+    }
+    crm_script.exit_ok(data)
+except Exception as e:
+    crm_script.exit_fail(str(e))
diff --git a/scripts/health/hahealth.py b/scripts/health/hahealth.py
new file mode 100755
index 0000000..f46aec6
--- /dev/null
+++ b/scripts/health/hahealth.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+"""Create and unpack a crm report, then return its analysis.txt."""
+import datetime
+import os
+import crm_script as crm
+
+
+if not os.path.isfile('/usr/sbin/crm') and not os.path.isfile('/usr/bin/crm'):
+    # crm not installed
+    crm.exit_ok({'status': 'crm not installed'})
+
+
+def get_from_date():
+    """Return the local time 24 hours ago as 'YYYY-MM-DD HH:MM'."""
+    # Computed in-process rather than by spawning date(1) through a shell.
+    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
+    return yesterday.strftime('%Y-%m-%d %H:%M')
+
+
+def create_report():
+    """Run "crm report" with -f get_from_date(); True when rc is 0."""
+    cmd = ['crm', 'report',
+           '-f', get_from_date(),
+           '-D', '-Z', 'health-report']
+    rc, out, err = crm.call(cmd, shell=False)
+    return rc == 0
+
+
+if not create_report():
+    crm.exit_ok({'status': 'Failed to create report'})
+
+
+def extract_report():
+    """Unpack health-report.tar.bz2 with tar; True when rc is 0."""
+    rc, out, err = crm.call(['tar', 'xjf', 'health-report.tar.bz2'], shell=False)
+    return rc == 0
+
+
+if not extract_report():
+    crm.exit_ok({'status': 'Failed to extract report'})
+
+analysis = ''
+if os.path.isfile('health-report/analysis.txt'):
+    with open('health-report/analysis.txt') as f:
+        analysis = f.read()
+
+crm.exit_ok({'status': 'OK', 'analysis': analysis})
diff --git a/scripts/health/main.yml b/scripts/health/main.yml
new file mode 100644
index 0000000..7c59bdd
--- /dev/null
+++ b/scripts/health/main.yml
@@ -0,0 +1,16 @@
+version: 2.2
+category: Basic
+shortdesc: Verify health and configuration
+longdesc: |
+  Checks and detects issues with the cluster, by creating and
+  analysing a cluster report.
+
+  Requires SSH access between cluster nodes. This command is
+  also available from the command line as "crm cluster health".
+actions:
+  - collect: collect.py
+    shortdesc: Collect information
+  - apply_local: hahealth.py
+    shortdesc: Run cluster health check
+  - report: report.py
+    shortdesc: Report cluster state
diff --git a/scripts/health/report.py b/scripts/health/report.py
new file mode 100755
index 0000000..51e11d2
--- /dev/null
+++ b/scripts/health/report.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python3
+"""Turn the collected per-node data into health warnings and errors."""
+import os
+import crm_script
+data = crm_script.get_input()
+health_report = data[1]
+
+print("Processing collected information...")
+
+CORE_PACKAGES = ['corosync', 'pacemaker', 'resource-agents']
+
+warnings = []
+errors = []
+
+
+def _format(fmt, args):
+    """Apply args to fmt; leave fmt untouched when there are no args."""
+    # Most callers pass an already formatted message. Running "%" on it
+    # again would fail if the text happens to contain a "%" character.
+    return fmt % args if args else fmt
+
+
+def warn(fmt, *args):
+    """Record a warning, formatting fmt with args if any are given."""
+    warnings.append(_format(fmt, args))
+
+
+def error(fmt, *args):
+    """Record an error, formatting fmt with args if any are given."""
+    errors.append(_format(fmt, args))
+
+
+# sort {package: {version: [host]}}
+rpm_versions = {}
+
+LOW_UPTIME = 60.0
+HIGH_LOAD = 1.0
+
+for node, info in health_report.items():
+    system = info['system']
+    if node != system['hostname']:
+        error("Hostname mismatch: %s is not %s" %
+              (node, system['hostname']))
+
+    if float(system['uptime']) < LOW_UPTIME:
+        warn("%s: Uptime is low: %ss" % (node, system['uptime']))
+
+    if float(system['loadavg']) > HIGH_LOAD:
+        warn("%s: 15 minute load average is %s" % (node, system['loadavg']))
+
+    for rpm in info['rpm']:
+        # Packages that could not be queried are grouped by their error
+        # text in place of a version.
+        key = rpm['error'] if 'error' in rpm else rpm['version']
+        versions = rpm_versions.setdefault(rpm['name'], {})
+        versions.setdefault(key, []).append(node)
+
+    for disk, use in info['disk']:
+        try:
+            use = int(use.rstrip('%'))
+        except ValueError:
+            # df prints "-" when it has no usage figure for a filesystem.
+            continue
+        if use > 90:
+            warn("On %s, disk %s usage is %s%%", node, disk, use)
+
+    for logfile, state in info['logrotate'].items():
+        if not state:
+            warn("%s: No log rotation configured for %s" % (node, logfile))
+
+for cp in CORE_PACKAGES:
+    if cp not in rpm_versions:
+        error("Core package '%s' not installed on any node", cp)
+
+for name, versions in rpm_versions.items():
+    if len(versions) > 1:
+        desc = ', '.join('%s (%s)' % (v, ', '.join(nodes)) for v, nodes in versions.items())
+        warn("Package %s: Versions differ! %s", name, desc)
+
+    all_hosts = set()
+    for hosts in versions.values():
+        all_hosts.update(hosts)
+    for node in health_report:
+        if all_hosts and node not in all_hosts:
+            warn("Package '%s' not installed on host '%s'" % (name, node))
+
+
+def compare_system(systems):
+    """Warn about platform attributes that differ between nodes.
+
+    systems is an iterable of (host, system info dict) pairs.
+    """
+    # The pairs are walked several times, so a one-shot iterator (the
+    # caller passes a generator) has to be materialized first.
+    systems = list(systems)
+
+    def check(value, msg):
+        # .get(): not every collected dict has every key (e.g. 'distver').
+        vals = set(system.get(value) for host, system in systems)
+        if len(vals) > 1:
+            info = ', '.join('%s: %s' % (h, system.get(value)) for h, system in systems)
+            warn("%s: %s" % (msg, info))
+
+    check('machine', 'Architecture differs')
+    check('release', 'Kernel release differs')
+    check('distname', 'Distribution differs')
+    check('distver', 'Distribution version differs')
+    # check('version', 'Kernel version differs')
+
+
+def compare_files(systems):
+    """Warn about configuration files whose checksums differ between nodes.
+
+    systems is an iterable of (host, {filename: checksum}) pairs.
+    """
+    # Materialize first: the pairs are walked more than once and the
+    # caller passes a generator.
+    systems = list(systems)
+    keys = set()
+    for host, files in systems:
+        keys.update(files)
+    for filename in sorted(keys):
+        vals = set(files.get(filename) for host, files in systems)
+        if len(vals) > 1:
+            info = ', '.join('%s: %s' % (h, files.get(filename)) for h, files in systems)
+            warn("Files differ: %s: %s", filename, info)
+
+
+compare_system((h, info['system']) for h, info in health_report.items())
+compare_files((h, info['files']) for h, info in health_report.items())
+
+report = crm_script.output(2)
+if report:
+    status = report.get('status')
+    analysis = report.get('analysis')
+    if status and not analysis:
+        warn("Cluster report: %s" % (status))
+    elif analysis:
+        print("INFO: Cluster report:")
+        print(analysis)
+    else:
+        warn("No cluster report generated")
+
+for e in errors:
+    print("ERROR:", e)
+for w in warnings:
+    print("WARNING:", w)
+
+if not errors and not warnings:
+    print("No issues found.")
+
+workdir = os.path.dirname(crm_script.__file__)
+print("\nINFO: health-report in directory \"%s\"" % workdir)
-- 
cgit v1.2.3