diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/pybind/mgr/cephadm/configchecks.py | |
parent | Initial commit. (diff) | |
download | ceph-b26c4052f3542036551aa9dec9caa4226e456195.tar.xz ceph-b26c4052f3542036551aa9dec9caa4226e456195.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/pybind/mgr/cephadm/configchecks.py')
-rw-r--r-- | src/pybind/mgr/cephadm/configchecks.py | 705 |
1 files changed, 705 insertions, 0 deletions
"""Cluster-wide configuration consistency checks for the cephadm mgr module.

Host facts cached by the orchestrator are folded into a handful of lookup
maps (subnet -> MTU/linkspeed, LSM -> hosts, kernel -> hosts, ...). Each
enabled check then inspects those maps and raises or clears a
``CEPHADM_CHECK_*`` entry in ``mgr.health_checks``.
"""
import json
import ipaddress
import logging

from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast, Tuple, Callable

if TYPE_CHECKING:
    from cephadm.module import CephadmOrchestrator
    # only used as a (string) cast target, so it is not needed at runtime
    from mgr_module import ServiceInfoT

logger = logging.getLogger(__name__)


class HostFacts:
    """Attribute view over the facts dict gathered for one host."""

    def __init__(self) -> None:
        self.arch: Optional[str] = None
        self.bios_date: Optional[str] = None
        self.bios_version: Optional[str] = None
        self.cpu_cores: Optional[int] = None
        self.cpu_count: Optional[int] = None
        self.cpu_load: Optional[Dict[str, float]] = None
        self.cpu_model: Optional[str] = None
        self.cpu_threads: Optional[int] = None
        self.flash_capacity: Optional[str] = None
        self.flash_capacity_bytes: Optional[int] = None
        self.flash_count: Optional[int] = None
        self.flash_list: Optional[List[Dict[str, Any]]] = None
        self.hdd_capacity: Optional[str] = None
        self.hdd_capacity_bytes: Optional[int] = None
        self.hdd_count: Optional[int] = None
        self.hdd_list: Optional[List[Dict[str, Any]]] = None
        self.hostname: Optional[str] = None
        self.interfaces: Optional[Dict[str, Dict[str, Any]]] = None
        self.kernel: Optional[str] = None
        self.kernel_parameters: Optional[Dict[str, Any]] = None
        self.kernel_security: Optional[Dict[str, str]] = None
        self.memory_available_kb: Optional[int] = None
        self.memory_free_kb: Optional[int] = None
        self.memory_total_kb: Optional[int] = None
        self.model: Optional[str] = None
        self.nic_count: Optional[int] = None
        self.operating_system: Optional[str] = None
        self.subscribed: Optional[str] = None
        self.system_uptime: Optional[float] = None
        self.timestamp: Optional[float] = None
        self.vendor: Optional[str] = None
        self._valid = False

    def load_facts(self, json_data: Dict[str, Any]) -> None:
        """Populate the attributes from ``json_data``.

        The data is only accepted (``_valid`` set to True) when it is a dict
        holding a key for every public attribute defined in ``__init__``;
        otherwise nothing is copied and ``_valid`` is False.
        """
        if not isinstance(json_data, dict):
            self._valid = False
            return

        required = [k for k in self.__dict__ if not k.startswith('_')]
        if not all(k in json_data for k in required):
            self._valid = False
            return

        self._valid = True
        for key, value in json_data.items():
            if hasattr(self, key):
                setattr(self, key, value)

    def subnet_to_nic(self, subnet: str) -> Optional[str]:
        """Return the name of the first interface with an address in ``subnet``.

        Returns None when no interface matches, or when no interface data has
        been loaded.
        """
        network = ipaddress.ip_network(subnet)
        ip_version = network.version
        logger.debug(f"subnet {subnet} is IP version {ip_version}")
        addr_key = 'ipv4_address' if ip_version == 4 else 'ipv6_address'

        # interfaces stays None until valid facts were loaded
        for iface, nic in (self.interfaces or {}).items():
            addr = nic.get(addr_key, '')
            if addr and ipaddress.ip_address(addr.split('/')[0]) in network:
                return iface
        return None


class SubnetLookup:
    """Per-subnet record of which hosts use which MTU and link speed."""

    def __init__(self, subnet: str, hostname: str, mtu: str, speed: str):
        self.subnet = subnet
        self.mtu_map: Dict[str, List[str]] = {
            mtu: [hostname]
        }
        self.speed_map: Dict[str, List[str]] = {
            speed: [hostname]
        }

    @property
    def host_list(self) -> List[str]:
        """All hostnames recorded against any MTU value of this subnet."""
        hosts: List[str] = []
        for mtu_hosts in self.mtu_map.values():
            hosts.extend(mtu_hosts)
        return hosts

    def update(self, hostname: str, mtu: str, speed: str) -> None:
        """Record ``hostname`` under the given MTU and speed values.

        A host already listed for a value is left alone; the hosts that were
        previously recorded for that value are never discarded.
        """
        for mapping, key in ((self.mtu_map, mtu), (self.speed_map, speed)):
            hosts = mapping.setdefault(key, [])
            if hostname not in hosts:
                hosts.append(hostname)

    def __repr__(self) -> str:
        return json.dumps({
            "subnet": self.subnet,
            "mtu_map": self.mtu_map,
            "speed_map": self.speed_map
        })


class CephadmCheckDefinition:
    """Static description of one config check plus the callable that runs it."""

    def __init__(self, mgr: "CephadmOrchestrator", healthcheck_name: str, description: str, name: str, func: Callable) -> None:
        self.mgr = mgr
        self.log = logger
        self.healthcheck_name = healthcheck_name
        self.description = description
        self.name = name
        self.func = func

    @property
    def status(self) -> str:
        """State stored for this check in the ``config_checks`` store key.

        Returns "Unknown" when the key is unset, "Unavailable" when it is not
        valid JSON and "Missing" when it has no entry for this check.
        """
        check_states: Dict[str, str] = {}
        # Issuing a get each time, since the value could be set at the CLI
        raw_states = self.mgr.get_store('config_checks')
        if not raw_states:
            self.log.error(
                "config_checks setting is not defined - unable to determine healthcheck state")
            return "Unknown"

        try:
            check_states = json.loads(raw_states)
        except json.JSONDecodeError:
            self.log.error("Unable to serialize the config_checks settings to JSON")
            return "Unavailable"

        return check_states.get(self.name, 'Missing')

    def to_json(self) -> Dict[str, Any]:
        return {
            "healthcheck_name": self.healthcheck_name,
            "description": self.description,
            "name": self.name,
            "status": self.status,
            "valid": True if self.func else False
        }


class CephadmConfigChecks:
    """Runs the configuration checks and maintains their health check state."""

    def __init__(self, mgr: "CephadmOrchestrator"):
        self.mgr: "CephadmOrchestrator" = mgr
        self.health_checks: List[CephadmCheckDefinition] = [
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_KERNEL_LSM",
                                   "checks SELINUX/Apparmor profiles are consistent across cluster hosts",
                                   "kernel_security",
                                   self._check_kernel_lsm),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_SUBSCRIPTION",
                                   "checks subscription states are consistent for all cluster hosts",
                                   "os_subscription",
                                   self._check_subscription),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_PUBLIC_MEMBERSHIP",
                                   "check that all hosts have a NIC on the Ceph public_network",
                                   "public_network",
                                   self._check_public_network),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_MTU",
                                   "check that OSD hosts share a common MTU setting",
                                   "osd_mtu_size",
                                   self._check_osd_mtu),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_LINKSPEED",
                                   "check that OSD hosts share a common linkspeed",
                                   "osd_linkspeed",
                                   self._check_osd_linkspeed),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_NETWORK_MISSING",
                                   "checks that the cluster/public networks defined exist on the Ceph hosts",
                                   "network_missing",
                                   self._check_network_missing),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_CEPH_RELEASE",
                                   "check for Ceph version consistency - ceph daemons should be on the same release (unless upgrade is active)",
                                   "ceph_release",
                                   self._check_release_parity),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_KERNEL_VERSION",
                                   "checks that the MAJ.MIN of the kernel on Ceph hosts is consistent",
                                   "kernel_version",
                                   self._check_kernel_version),
        ]
        self.log = logger
        self.host_facts: Dict[str, HostFacts] = {}
        self.subnet_lookup: Dict[str, SubnetLookup] = {}  # subnet CIDR -> SubnetLookup Object
        self.lsm_to_host: Dict[str, List[str]] = {}
        self.subscribed: Dict[str, List[str]] = {
            "yes": [],
            "no": [],
            "unknown": [],
        }
        self.host_to_role: Dict[str, List[str]] = {}
        self.kernel_to_hosts: Dict[str, List[str]] = {}

        self.public_network_list: List[str] = []
        self.cluster_network_list: List[str] = []
        self.health_check_raised = False
        self.active_checks: List[str] = []  # checks enabled and executed
        self.skipped_checks: List[str] = []  # checks enabled, but skipped due to a pre-req failure

        raw_checks = self.mgr.get_store('config_checks')
        if not raw_checks:
            # doesn't exist, so seed the checks
            self.seed_config_checks()
        else:
            # setting is there, so ensure there is an entry for each of the checks that
            # this module supports (account for upgrades/changes)
            try:
                config_checks = json.loads(raw_checks)
            except json.JSONDecodeError:
                self.log.error("Unable to serialize config_checks config. Reset to defaults")
                self.seed_config_checks()
            else:
                # Ensure the config_checks setting is consistent with this module
                from_config = set(config_checks.keys())
                from_module = set([c.name for c in self.health_checks])
                old_checks = from_config.difference(from_module)
                new_checks = from_module.difference(from_config)

                if old_checks:
                    self.log.debug(f"old checks being removed from config_checks: {old_checks}")
                    for i in old_checks:
                        del config_checks[i]
                if new_checks:
                    self.log.debug(f"new checks being added to config_checks: {new_checks}")
                    for i in new_checks:
                        config_checks[i] = 'enabled'

                if old_checks or new_checks:
                    self.log.info(
                        f"config_checks updated: {len(old_checks)} removed, {len(new_checks)} added")
                    self.mgr.set_store('config_checks', json.dumps(config_checks))
                else:
                    self.log.debug("config_checks match module definition")

    def lookup_check(self, key_value: str, key_name: str = 'name') -> Optional[CephadmCheckDefinition]:
        """Return the first check whose ``key_name`` attribute equals ``key_value``."""
        for c in self.health_checks:
            if getattr(c, key_name) == key_value:
                return c
        return None

    @property
    def defined_checks(self) -> int:
        return len(self.health_checks)

    @property
    def active_checks_count(self) -> int:
        return len(self.active_checks)

    def seed_config_checks(self) -> None:
        """Store a ``config_checks`` setting with every known check enabled."""
        defaults = {check.name: 'enabled' for check in self.health_checks}
        self.mgr.set_store('config_checks', json.dumps(defaults))

    @property
    def skipped_checks_count(self) -> int:
        return len(self.skipped_checks)

    def to_json(self) -> List[Dict[str, Any]]:
        return [check.to_json() for check in self.health_checks]

    def load_network_config(self) -> None:
        """Refresh the public/cluster network lists from ``config dump``."""
        ret, out, _err = self.mgr.check_mon_command({
            'prefix': 'config dump',
            'format': 'json'
        })
        assert ret == 0
        js = json.loads(out)
        for item in js:
            if item['name'] == "cluster_network":
                self.cluster_network_list = self._split_networks(item['value'])
            if item['name'] == "public_network":
                self.public_network_list = self._split_networks(item['value'])

        self.log.debug(f"public networks {self.public_network_list}")
        self.log.debug(f"cluster networks {self.cluster_network_list}")

    @staticmethod
    def _split_networks(value: str) -> List[str]:
        """Split a comma separated network setting into clean CIDR strings."""
        # "a, b" must yield 'b' (not ' b') or the subnet lookups never match
        return [net.strip() for net in value.split(',') if net.strip()]

    def _update_subnet(self, subnet: str, hostname: str, nic: Dict[str, Any]) -> None:
        """Record the NIC's mtu/speed for ``hostname`` under ``subnet``.

        NICs without both an mtu and a speed value are ignored.
        """
        mtu = nic.get('mtu', None)
        speed = nic.get('speed', None)
        if not mtu or not speed:
            return

        this_subnet = self.subnet_lookup.get(subnet, None)
        if this_subnet:
            this_subnet.update(hostname, mtu, speed)
        else:
            self.subnet_lookup[subnet] = SubnetLookup(subnet, hostname, mtu, speed)

    def _update_subnet_lookups(self, hostname: str, devname: str, nic: Dict[str, Any]) -> None:
        """Add the IPv4 and IPv6 subnets of one NIC to the subnet lookup."""
        # a malformed prefix raises NetmaskValueError, a malformed address
        # AddressValueError - both just mean "skip this address"
        bad_network = (ipaddress.AddressValueError, ipaddress.NetmaskValueError)

        ipv4_address = nic.get('ipv4_address')
        if ipv4_address:
            try:
                iface4 = ipaddress.IPv4Interface(ipv4_address)
                subnet = str(iface4.network)
            except bad_network as e:
                self.log.exception(f"Invalid network on {hostname}, interface {devname} : {str(e)}")
            else:
                self._update_subnet(subnet, hostname, nic)

        ipv6_address = nic.get('ipv6_address')
        if ipv6_address:
            try:
                iface6 = ipaddress.IPv6Interface(ipv6_address)
                subnet = str(iface6.network)
            except bad_network as e:
                self.log.exception(f"Invalid network on {hostname}, interface {devname} : {str(e)}")
            else:
                self._update_subnet(subnet, hostname, nic)

    def hosts_with_role(self, role: str) -> List[str]:
        """Return the hosts whose recorded daemon types include ``role``."""
        return [hostname for hostname, roles in self.host_to_role.items() if role in roles]

    def reset(self) -> None:
        """Clear the lookup maps built by the previous run."""
        self.subnet_lookup.clear()
        self.lsm_to_host.clear()
        self.subscribed['yes'] = []
        self.subscribed['no'] = []
        self.subscribed['unknown'] = []
        self.host_to_role.clear()
        self.kernel_to_hosts.clear()

    def _get_majority(self, data: Dict[str, List[str]]) -> Tuple[str, int]:
        """Return (key, member count) of the largest list in ``data``.

        The first key wins a tie; ('', 0) is returned when there is nothing
        to count.
        """
        assert isinstance(data, dict)

        majority_key = ''
        majority_count = 0
        for key in data:
            if len(data[key]) > majority_count:
                majority_count = len(data[key])
                majority_key = key
        return majority_key, majority_count

    def get_ceph_metadata(self) -> Dict[str, Optional[Dict[str, str]]]:
        """Build a map of service -> service metadata"""
        service_map: Dict[str, Optional[Dict[str, str]]] = {}

        for server in self.mgr.list_servers():
            for service in cast("List[ServiceInfoT]", server.get('services', [])):
                if service:
                    service_map.update(
                        {
                            f"{service['type']}.{service['id']}":
                            self.mgr.get_metadata(service['type'], service['id'])
                        }
                    )
        return service_map

    def _check_kernel_lsm(self) -> None:
        if len(self.lsm_to_host.keys()) > 1:

            majority_hosts_ptr, majority_hosts_count = self._get_majority(self.lsm_to_host)
            lsm_copy = self.lsm_to_host.copy()
            del lsm_copy[majority_hosts_ptr]
            details = []
            for lsm_key in lsm_copy.keys():
                for host in lsm_copy[lsm_key]:
                    details.append(
                        f"{host} has inconsistent KSM settings compared to the "
                        f"majority of hosts({majority_hosts_count}) in the cluster")
            host_sfx = 's' if len(details) > 1 else ''
            self.mgr.health_checks['CEPHADM_CHECK_KERNEL_LSM'] = {
                'severity': 'warning',
                'summary': f"Kernel Security Module (SELinux/AppArmor) is inconsistent for "
                           f"{len(details)} host{host_sfx}",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_LSM', None)

    def _check_subscription(self) -> None:
        if len(self.subscribed['yes']) > 0 and len(self.subscribed['no']) > 0:
            # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION
            details = []
            for host in self.subscribed['no']:
                details.append(f"{host} does not have an active subscription")
            self.mgr.health_checks['CEPHADM_CHECK_SUBSCRIPTION'] = {
                'severity': 'warning',
                'summary': f"Support subscriptions inactive on {len(details)} host(s)"
                           f"({len(self.subscribed['yes'])} subscriptions active)",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_SUBSCRIPTION', None)

    def _check_public_network(self) -> None:
        hosts_remaining: List[str] = list(self.mgr.cache.facts.keys())
        hosts_removed: List[str] = []
        self.log.debug(f"checking public network membership for: {hosts_remaining}")

        for p_net in self.public_network_list:
            self.log.debug(f"checking network {p_net}")
            subnet_data = self.subnet_lookup.get(p_net, None)
            self.log.debug(f"subnet data - {subnet_data}")

            if subnet_data:
                hosts_in_subnet = subnet_data.host_list
                for host in hosts_in_subnet:
                    if host in hosts_remaining:
                        hosts_remaining.remove(host)
                        hosts_removed.append(host)
                    else:
                        if host not in hosts_removed:
                            self.log.debug(f"host={host}, subnet={p_net}")
                            # no exception is active here, so log.exception
                            # would only append a bogus "NoneType: None"
                            self.log.error(
                                "Host listed for a subnet but not present in the host facts?")

        # Ideally all hosts will have been removed since they have an IP on at least
        # one of the public networks
        if hosts_remaining:
            if len(hosts_remaining) != len(self.mgr.cache.facts):
                # public network is visible on some hosts
                details = [
                    f"{host} does not have an interface on any public network" for host in hosts_remaining]

                self.mgr.health_checks['CEPHADM_CHECK_PUBLIC_MEMBERSHIP'] = {
                    'severity': 'warning',
                    'summary': f"Public network(s) is not directly accessible from {len(hosts_remaining)} "
                               "cluster hosts",
                    'count': len(details),
                    'detail': details,
                }
                self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_PUBLIC_MEMBERSHIP', None)

    def _check_osd_mtu(self) -> None:
        osd_hosts = set(self.hosts_with_role('osd'))
        osd_network_list = self.cluster_network_list or self.public_network_list
        mtu_errors: List[str] = []

        for osd_net in osd_network_list:
            subnet_data = self.subnet_lookup.get(osd_net, None)
            if not subnet_data:
                continue

            self.log.debug(f"processing mtu map : {json.dumps(subnet_data.mtu_map)}")
            # a problem exists when some MTU value is not shared by every OSD host
            if not any(osd_hosts.difference(hosts) for hosts in subnet_data.mtu_map.values()):
                continue

            # the reference value is the one used by the most hosts
            mtu_ptr, _mtu_hosts = self._get_majority(subnet_data.mtu_map)
            self.log.debug("MTU problems detected")
            self.log.debug(f"most hosts using {mtu_ptr}")
            for bad_mtu, bad_hosts in subnet_data.mtu_map.items():
                if bad_mtu == mtu_ptr:
                    continue
                for h in bad_hosts:
                    host = HostFacts()
                    host.load_facts(self.mgr.cache.facts[h])
                    mtu_errors.append(
                        f"host {h}({host.subnet_to_nic(osd_net)}) is using MTU "
                        f"{bad_mtu} on {osd_net}, NICs on other hosts use {mtu_ptr}")

        if mtu_errors:
            self.mgr.health_checks['CEPHADM_CHECK_MTU'] = {
                'severity': 'warning',
                'summary': f"MTU setting inconsistent on osd network NICs on {len(mtu_errors)} host(s)",
                'count': len(mtu_errors),
                'detail': mtu_errors,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_MTU', None)

    def _check_osd_linkspeed(self) -> None:
        osd_hosts = set(self.hosts_with_role('osd'))
        osd_network_list = self.cluster_network_list or self.public_network_list

        linkspeed_errors = []

        for osd_net in osd_network_list:
            subnet_data = self.subnet_lookup.get(osd_net, None)
            if not subnet_data:
                continue

            self.log.debug(f"processing subnet : {subnet_data}")
            # a problem exists when some speed is not shared by every OSD host
            if not any(osd_hosts.difference(hosts) for hosts in subnet_data.speed_map.values()):
                continue

            # the reference value is the one used by the most hosts
            speed_ptr, _speed_hosts = self._get_majority(subnet_data.speed_map)
            self.log.debug("linkspeed issue(s) detected")
            self.log.debug(f"most hosts using {speed_ptr}")
            for bad_speed, bad_hosts in subnet_data.speed_map.items():
                if bad_speed == speed_ptr:
                    continue
                if bad_speed > speed_ptr:
                    # skip speed is better than most...it can stay!
                    continue
                for h in bad_hosts:
                    host = HostFacts()
                    host.load_facts(self.mgr.cache.facts[h])
                    linkspeed_errors.append(
                        f"host {h}({host.subnet_to_nic(osd_net)}) has linkspeed of "
                        f"{bad_speed} on {osd_net}, NICs on other hosts use {speed_ptr}")

        if linkspeed_errors:
            self.mgr.health_checks['CEPHADM_CHECK_LINKSPEED'] = {
                'severity': 'warning',
                'summary': "Link speed is inconsistent on osd network NICs for "
                           f"{len(linkspeed_errors)} host(s)",
                'count': len(linkspeed_errors),
                'detail': linkspeed_errors,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_LINKSPEED', None)

    def _check_network_missing(self) -> None:
        all_networks = self.public_network_list.copy()
        all_networks.extend(self.cluster_network_list)

        missing_networks = []
        for subnet in all_networks:
            subnet_data = self.subnet_lookup.get(subnet, None)

            if not subnet_data:
                missing_networks.append(f"{subnet} not found on any host in the cluster")
                self.log.warning(
                    f"Network {subnet} has been defined, but is not present on any host")

        if missing_networks:
            net_sfx = 's' if len(missing_networks) > 1 else ''
            self.mgr.health_checks['CEPHADM_CHECK_NETWORK_MISSING'] = {
                'severity': 'warning',
                'summary': f"Public/cluster network{net_sfx} defined, but can not be found on "
                           "any host",
                'count': len(missing_networks),
                'detail': missing_networks,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_NETWORK_MISSING', None)

    def _check_release_parity(self) -> None:
        upgrade_status = self.mgr.upgrade.upgrade_status()
        if upgrade_status.in_progress:
            # skip version consistency checks during an upgrade cycle
            self.skipped_checks.append('ceph_release')
            return

        services = self.get_ceph_metadata()
        self.log.debug(json.dumps(services))
        version_to_svcs: Dict[str, List[str]] = {}

        for svc in services:
            if services[svc]:
                metadata = cast(Dict[str, str], services[svc])
                v = metadata.get('ceph_release', '')
                version_to_svcs.setdefault(v, []).append(svc)

        if len(version_to_svcs) > 1:
            majority_ptr, _majority_count = self._get_majority(version_to_svcs)
            ver_copy = version_to_svcs.copy()
            del ver_copy[majority_ptr]
            details = []
            for v in ver_copy:
                for svc in ver_copy[v]:
                    details.append(
                        f"{svc} is running {v} (majority of cluster is using {majority_ptr})")

            self.mgr.health_checks['CEPHADM_CHECK_CEPH_RELEASE'] = {
                'severity': 'warning',
                'summary': 'Ceph cluster running mixed ceph releases',
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
            self.log.warning(
                f"running with {len(version_to_svcs)} different ceph releases within this cluster")
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_CEPH_RELEASE', None)

    def _check_kernel_version(self) -> None:
        if len(self.kernel_to_hosts.keys()) > 1:
            majority_hosts_ptr, majority_hosts_count = self._get_majority(self.kernel_to_hosts)
            kver_copy = self.kernel_to_hosts.copy()
            del kver_copy[majority_hosts_ptr]
            details = []
            for k in kver_copy:
                for h in kver_copy[k]:
                    details.append(
                        f"host {h} running kernel {k}, majority of hosts({majority_hosts_count}) "
                        f"running {majority_hosts_ptr}")

            self.log.warning("mixed kernel versions detected")
            self.mgr.health_checks['CEPHADM_CHECK_KERNEL_VERSION'] = {
                'severity': 'warning',
                'summary': f"{len(details)} host(s) running different kernel versions",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_VERSION', None)

    def _process_hosts(self) -> None:
        """Walk the cached host facts once and fill the lookup maps."""
        self.log.debug(f"processing data from {len(self.mgr.cache.facts)} hosts")
        for hostname in self.mgr.cache.facts:
            host = HostFacts()
            host.load_facts(self.mgr.cache.facts[hostname])
            if not host._valid:
                self.log.warning(f"skipping {hostname} - incompatible host facts")
                continue

            # a valid facts dict may still carry a null for these sections
            kernel_lsm = host.kernel_security or {}
            lsm_desc = kernel_lsm.get('description', '')
            if lsm_desc:
                self.lsm_to_host.setdefault(lsm_desc, []).append(hostname)

            subscription_state = host.subscribed.lower() if host.subscribed else None
            if subscription_state:
                if subscription_state not in self.subscribed:
                    # only yes/no/unknown are tracked (and reset); anything
                    # else is counted as unknown instead of raising KeyError
                    subscription_state = 'unknown'
                self.subscribed[subscription_state].append(hostname)

            interfaces = host.interfaces or {}
            for name in interfaces.keys():
                if name in ['lo']:
                    continue
                self._update_subnet_lookups(hostname, name, interfaces[name])

            if host.kernel:
                kernel_maj_min = '.'.join(host.kernel.split('.')[0:2])
                self.kernel_to_hosts.setdefault(kernel_maj_min, []).append(hostname)
            else:
                self.log.warning(f"Host gather facts for {hostname} is missing kernel information")

            # NOTE: if daemondescription had systemd enabled state, we could check for systemd 'tampering'
            self.host_to_role[hostname] = list(self.mgr.cache.get_daemon_types(hostname))

    def run_checks(self) -> None:
        """Rebuild the lookup maps and run every check that is not disabled."""
        checks_enabled = self.mgr.get_module_option('config_checks_enabled')
        if checks_enabled is not True:
            return

        self.reset()

        check_config: Dict[str, str] = {}
        checks_raw: Optional[str] = self.mgr.get_store('config_checks')
        if checks_raw:
            try:
                check_config.update(json.loads(checks_raw))
            except json.JSONDecodeError:
                self.log.exception(
                    "mgr/cephadm/config_checks is not JSON serializable - all checks will run")

        # build lookup "maps" by walking the host facts, once
        self._process_hosts()

        self.health_check_raised = False
        self.active_checks = []
        self.skipped_checks = []

        # process all healthchecks that are not explicitly disabled
        for health_check in self.health_checks:
            if check_config.get(health_check.name, '') != 'disabled':
                self.active_checks.append(health_check.name)
                health_check.func()

        self.mgr.set_health_checks(self.mgr.health_checks)