summaryrefslogtreecommitdiffstats
path: root/src/pybind/mgr/cephadm/configchecks.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/pybind/mgr/cephadm/configchecks.py')
-rw-r--r--src/pybind/mgr/cephadm/configchecks.py705
1 files changed, 705 insertions, 0 deletions
diff --git a/src/pybind/mgr/cephadm/configchecks.py b/src/pybind/mgr/cephadm/configchecks.py
new file mode 100644
index 000000000..b9dcb18f4
--- /dev/null
+++ b/src/pybind/mgr/cephadm/configchecks.py
@@ -0,0 +1,705 @@
+import json
+import ipaddress
+import logging
+
+from mgr_module import ServiceInfoT
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast, Tuple, Callable
+
+if TYPE_CHECKING:
+ from cephadm.module import CephadmOrchestrator
+
+logger = logging.getLogger(__name__)
+
+
class HostFacts:
    """Container for the facts gathered from a single host.

    Each public attribute corresponds to one key of the gathered facts
    dictionary. Attributes start as ``None`` and are populated by
    :meth:`load_facts`; ``_valid`` records whether the last load supplied
    every expected key.
    """

    def __init__(self) -> None:
        self.arch: Optional[str] = None
        self.bios_date: Optional[str] = None
        self.bios_version: Optional[str] = None
        self.cpu_cores: Optional[int] = None
        self.cpu_count: Optional[int] = None
        self.cpu_load: Optional[Dict[str, float]] = None
        self.cpu_model: Optional[str] = None
        self.cpu_threads: Optional[int] = None
        self.flash_capacity: Optional[str] = None
        self.flash_capacity_bytes: Optional[int] = None
        self.flash_count: Optional[int] = None
        self.flash_list: Optional[List[Dict[str, Any]]] = None
        self.hdd_capacity: Optional[str] = None
        self.hdd_capacity_bytes: Optional[int] = None
        self.hdd_count: Optional[int] = None
        self.hdd_list: Optional[List[Dict[str, Any]]] = None
        self.hostname: Optional[str] = None
        self.interfaces: Optional[Dict[str, Dict[str, Any]]] = None
        self.kernel: Optional[str] = None
        self.kernel_parameters: Optional[Dict[str, Any]] = None
        self.kernel_security: Optional[Dict[str, str]] = None
        self.memory_available_kb: Optional[int] = None
        self.memory_free_kb: Optional[int] = None
        self.memory_total_kb: Optional[int] = None
        self.model: Optional[str] = None
        self.nic_count: Optional[int] = None
        self.operating_system: Optional[str] = None
        self.subscribed: Optional[str] = None
        self.system_uptime: Optional[float] = None
        self.timestamp: Optional[float] = None
        self.vendor: Optional[str] = None
        self._valid = False

    def load_facts(self, json_data: Dict[str, Any]) -> None:
        """Populate the fact attributes from ``json_data``.

        The data is accepted only when it is a dict containing every public
        fact attribute defined by this class; ``_valid`` is set accordingly.
        When the data is rejected no attribute is modified.
        """
        if not isinstance(json_data, dict):
            self._valid = False
            return

        fact_names = [name for name in self.__dict__ if not name.startswith('_')]
        if not all(name in json_data for name in fact_names):
            self._valid = False
            return

        self._valid = True
        # Copy only the declared facts: unknown keys are ignored and keys that
        # collide with methods or private state (e.g. 'load_facts', '_valid')
        # can never overwrite them.
        for name in fact_names:
            setattr(self, name, json_data[name])

    def subnet_to_nic(self, subnet: str) -> Optional[str]:
        """Return the name of the first interface with an address in ``subnet``.

        The interface's ``ipv4_address`` or ``ipv6_address`` entry is used
        depending on the IP version of ``subnet``. Returns ``None`` when no
        interface matches or when no interface data has been loaded.

        Raises:
            ValueError: if ``subnet`` is not a valid network definition.
        """
        network = ipaddress.ip_network(subnet)
        logger.debug(f"subnet {subnet} is IP version {network.version}")

        if not self.interfaces:
            # facts were never loaded (or were rejected) - nothing to search
            return None

        addr_key = 'ipv4_address' if network.version == 4 else 'ipv6_address'
        for iface, details in self.interfaces.items():
            addr = details.get(addr_key, '')
            if not addr:
                continue
            try:
                # addresses are stored as <ip>/<prefix>; only the ip is needed
                ip = ipaddress.ip_address(addr.split('/')[0])
            except ValueError:
                logger.debug(f"ignoring malformed address '{addr}' on interface {iface}")
                continue
            if ip in network:
                return iface
        return None
+
+
class SubnetLookup:
    """Track which hosts use which MTU and link speed on one subnet.

    ``mtu_map`` and ``speed_map`` map an MTU / speed value to the list of
    hostnames that reported it for an interface on ``subnet``.
    """

    def __init__(self, subnet: str, hostname: str, mtu: str, speed: str):
        self.subnet = subnet
        self.mtu_map = {
            mtu: [hostname]
        }
        self.speed_map = {
            speed: [hostname]
        }

    @property
    def host_list(self) -> List[str]:
        """Hostnames recorded against any MTU value, in map order."""
        hosts = []
        for members in self.mtu_map.values():
            hosts.extend(members)
        return hosts

    def update(self, hostname: str, mtu: str, speed: str) -> None:
        """Record that ``hostname`` uses ``mtu`` and ``speed`` on this subnet.

        Recording the same host against the same value more than once is a
        no-op; hosts already recorded for that value are always preserved.
        """
        for mapping, value in ((self.mtu_map, mtu), (self.speed_map, speed)):
            members = mapping.setdefault(value, [])
            if hostname not in members:
                members.append(hostname)

    def __repr__(self) -> str:
        return json.dumps({
            "subnet": self.subnet,
            "mtu_map": self.mtu_map,
            "speed_map": self.speed_map
        })
+
+
class CephadmCheckDefinition:
    """Describe a single configuration check and the function implementing it."""

    def __init__(self, mgr: "CephadmOrchestrator", healthcheck_name: str, description: str, name: str, func: Callable) -> None:
        self.mgr = mgr
        self.log = logger
        self.healthcheck_name = healthcheck_name
        self.description = description
        self.name = name
        self.func = func

    @property
    def status(self) -> str:
        """Return the state stored for this check in the ``config_checks`` setting.

        Returns ``"Unknown"`` when the setting is absent, ``"Unavailable"``
        when it cannot be decoded into a JSON object, and ``"Missing"`` when
        the setting holds no entry for this check.
        """
        # Issuing a get each time, since the value could be set at the CLI
        raw_states = self.mgr.get_store('config_checks')
        if not raw_states:
            self.log.error(
                "config_checks setting is not defined - unable to determine healthcheck state")
            return "Unknown"

        try:
            check_states = json.loads(raw_states)
        except json.JSONDecodeError:
            self.log.error("Unable to serialize the config_checks settings to JSON")
            return "Unavailable"

        if not isinstance(check_states, dict):
            # valid JSON, but not the name -> state mapping we need
            self.log.error("Unable to serialize the config_checks settings to JSON")
            return "Unavailable"

        return check_states.get(self.name, 'Missing')

    def to_json(self) -> Dict[str, Any]:
        """Return a JSON-friendly summary of the check, including its current status."""
        return {
            "healthcheck_name": self.healthcheck_name,
            "description": self.description,
            "name": self.name,
            "status": self.status,
            "valid": bool(self.func)
        }
+
+
class CephadmConfigChecks:
    """Run configuration consistency checks across the hosts known to cephadm.

    The host facts held in ``mgr.cache.facts`` are walked once per run to
    build lookup maps (subnets, kernel versions, security modules, ...).
    Each enabled check then inspects those maps and adds or removes its entry
    in ``mgr.health_checks``.
    """

    def __init__(self, mgr: "CephadmOrchestrator"):
        self.mgr: "CephadmOrchestrator" = mgr
        self.health_checks: List[CephadmCheckDefinition] = [
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_KERNEL_LSM",
                                   "checks SELINUX/Apparmor profiles are consistent across cluster hosts",
                                   "kernel_security",
                                   self._check_kernel_lsm),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_SUBSCRIPTION",
                                   "checks subscription states are consistent for all cluster hosts",
                                   "os_subscription",
                                   self._check_subscription),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_PUBLIC_MEMBERSHIP",
                                   "check that all hosts have a NIC on the Ceph public_network",
                                   "public_network",
                                   self._check_public_network),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_MTU",
                                   "check that OSD hosts share a common MTU setting",
                                   "osd_mtu_size",
                                   self._check_osd_mtu),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_LINKSPEED",
                                   "check that OSD hosts share a common linkspeed",
                                   "osd_linkspeed",
                                   self._check_osd_linkspeed),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_NETWORK_MISSING",
                                   "checks that the cluster/public networks defined exist on the Ceph hosts",
                                   "network_missing",
                                   self._check_network_missing),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_CEPH_RELEASE",
                                   "check for Ceph version consistency - ceph daemons should be on the same release (unless upgrade is active)",
                                   "ceph_release",
                                   self._check_release_parity),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_KERNEL_VERSION",
                                   "checks that the MAJ.MIN of the kernel on Ceph hosts is consistent",
                                   "kernel_version",
                                   self._check_kernel_version),
        ]
        self.log = logger
        self.host_facts: Dict[str, HostFacts] = {}
        self.subnet_lookup: Dict[str, SubnetLookup] = {}  # subnet CIDR -> SubnetLookup Object
        self.lsm_to_host: Dict[str, List[str]] = {}
        self.subscribed: Dict[str, List[str]] = {
            "yes": [],
            "no": [],
            "unknown": [],
        }
        self.host_to_role: Dict[str, List[str]] = {}
        self.kernel_to_hosts: Dict[str, List[str]] = {}

        self.public_network_list: List[str] = []
        self.cluster_network_list: List[str] = []
        self.health_check_raised = False
        self.active_checks: List[str] = []  # checks enabled and executed
        self.skipped_checks: List[str] = []  # checks enabled, but skipped due to a pre-req failure

        raw_checks = self.mgr.get_store('config_checks')
        if not raw_checks:
            # doesn't exist, so seed the checks
            self.seed_config_checks()
            return

        # setting is there, so ensure there is an entry for each of the checks that
        # this module supports (account for upgrades/changes)
        try:
            config_checks = json.loads(raw_checks)
        except json.JSONDecodeError:
            config_checks = None

        if not isinstance(config_checks, dict):
            # undecodable, or decodable but not a name -> state mapping
            self.log.error("Unable to serialize config_checks config. Reset to defaults")
            self.seed_config_checks()
            return

        # Ensure the config_checks setting is consistent with this module
        from_config = set(config_checks.keys())
        from_module = {c.name for c in self.health_checks}
        old_checks = from_config.difference(from_module)
        new_checks = from_module.difference(from_config)

        if old_checks:
            self.log.debug(f"old checks being removed from config_checks: {old_checks}")
            for name in old_checks:
                del config_checks[name]
        if new_checks:
            self.log.debug(f"new checks being added to config_checks: {new_checks}")
            for name in new_checks:
                config_checks[name] = 'enabled'

        if old_checks or new_checks:
            self.log.info(
                f"config_checks updated: {len(old_checks)} removed, {len(new_checks)} added")
            self.mgr.set_store('config_checks', json.dumps(config_checks))
        else:
            self.log.debug("config_checks match module definition")

    def lookup_check(self, key_value: str, key_name: str = 'name') -> Optional["CephadmCheckDefinition"]:
        """Return the first check whose ``key_name`` attribute equals ``key_value``, else None."""
        for c in self.health_checks:
            if getattr(c, key_name) == key_value:
                return c
        return None

    @property
    def defined_checks(self) -> int:
        """Number of checks this module defines."""
        return len(self.health_checks)

    @property
    def active_checks_count(self) -> int:
        """Number of checks executed during the last run."""
        return len(self.active_checks)

    def seed_config_checks(self) -> None:
        """Store a ``config_checks`` setting with every defined check enabled."""
        defaults = {check.name: 'enabled' for check in self.health_checks}
        self.mgr.set_store('config_checks', json.dumps(defaults))

    @property
    def skipped_checks_count(self) -> int:
        """Number of checks skipped during the last run."""
        return len(self.skipped_checks)

    def to_json(self) -> List[Dict[str, Any]]:
        """Return the JSON-friendly summary of every defined check."""
        return [check.to_json() for check in self.health_checks]

    def load_network_config(self) -> None:
        """Refresh the public/cluster network lists from ``config dump``.

        Each option value is treated as a comma separated list; surrounding
        whitespace is removed from every entry and empty entries are dropped.
        A list is emptied when its option is absent from the dump.
        """
        ret, out, _err = self.mgr.check_mon_command({
            'prefix': 'config dump',
            'format': 'json'
        })
        assert ret == 0
        js = json.loads(out)

        networks: Dict[str, List[str]] = {"cluster_network": [], "public_network": []}
        for item in js:
            if item['name'] in networks:
                networks[item['name']] = [
                    net.strip() for net in item['value'].split(',') if net.strip()]
        self.cluster_network_list = networks["cluster_network"]
        self.public_network_list = networks["public_network"]

        self.log.debug(f"public networks {self.public_network_list}")
        self.log.debug(f"cluster networks {self.cluster_network_list}")

    def _update_subnet(self, subnet: str, hostname: str, nic: Dict[str, Any]) -> None:
        """Record the NIC's mtu/speed for ``hostname`` against ``subnet``.

        NICs without both an mtu and a speed value are ignored.
        """
        mtu = nic.get('mtu', None)
        speed = nic.get('speed', None)
        if not mtu or not speed:
            return

        this_subnet = self.subnet_lookup.get(subnet, None)
        if this_subnet:
            this_subnet.update(hostname, mtu, speed)
        else:
            self.subnet_lookup[subnet] = SubnetLookup(subnet, hostname, mtu, speed)

    def _update_subnet_lookups(self, hostname: str, devname: str, nic: Dict[str, Any]) -> None:
        """Add the IPv4 and IPv6 subnets of interface ``devname`` to the subnet lookup.

        Missing or empty address entries are skipped; invalid addresses or
        netmasks are logged and skipped.
        """
        interface_types = (
            ('ipv4_address', ipaddress.IPv4Interface),
            ('ipv6_address', ipaddress.IPv6Interface),
        )
        for addr_key, interface_cls in interface_types:
            address = nic.get(addr_key)
            if not address:
                continue
            try:
                subnet = str(interface_cls(address).network)
            except (ipaddress.AddressValueError, ipaddress.NetmaskValueError) as e:
                self.log.exception(f"Invalid network on {hostname}, interface {devname} : {str(e)}")
            else:
                self._update_subnet(subnet, hostname, nic)

    def hosts_with_role(self, role: str) -> List[str]:
        """Return the hosts whose recorded roles include ``role``."""
        return [hostname for hostname, roles in self.host_to_role.items() if role in roles]

    def reset(self) -> None:
        """Discard the lookup maps built from host facts by a previous run."""
        self.subnet_lookup.clear()
        self.lsm_to_host.clear()
        self.subscribed['yes'] = []
        self.subscribed['no'] = []
        self.subscribed['unknown'] = []
        self.host_to_role.clear()
        self.kernel_to_hosts.clear()

    def _get_majority(self, data: Dict[str, List[str]]) -> Tuple[str, int]:
        """Return ``(key, count)`` for the key holding the longest list.

        Ties go to the key encountered first; ``('', 0)`` is returned when no
        key holds a non-empty list.
        """
        assert isinstance(data, dict)

        majority_key = ''
        majority_count = 0
        for key, members in data.items():
            if len(members) > majority_count:
                majority_count = len(members)
                majority_key = key
        return majority_key, majority_count

    def get_ceph_metadata(self) -> Dict[str, Optional[Dict[str, str]]]:
        """Build a map of service -> service metadata"""
        service_map: Dict[str, Optional[Dict[str, str]]] = {}

        for server in self.mgr.list_servers():
            for service in cast(List[ServiceInfoT], server.get('services', [])):
                if service:
                    key = f"{service['type']}.{service['id']}"
                    service_map[key] = self.mgr.get_metadata(service['type'], service['id'])
        return service_map

    def _check_kernel_lsm(self) -> None:
        """Warn when hosts report differing kernel security module descriptions."""
        if len(self.lsm_to_host) <= 1:
            self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_LSM', None)
            return

        majority_hosts_ptr, majority_hosts_count = self._get_majority(self.lsm_to_host)
        details = []
        for lsm_key, hosts in self.lsm_to_host.items():
            if lsm_key == majority_hosts_ptr:
                continue
            for host in hosts:
                details.append(
                    f"{host} has inconsistent KSM settings compared to the "
                    f"majority of hosts({majority_hosts_count}) in the cluster")
        host_sfx = 's' if len(details) > 1 else ''
        self.mgr.health_checks['CEPHADM_CHECK_KERNEL_LSM'] = {
            'severity': 'warning',
            'summary': f"Kernel Security Module (SELinux/AppArmor) is inconsistent for "
                       f"{len(details)} host{host_sfx}",
            'count': len(details),
            'detail': details,
        }
        self.health_check_raised = True

    def _check_subscription(self) -> None:
        """Warn when some hosts hold an active subscription and others do not."""
        if len(self.subscribed['yes']) > 0 and len(self.subscribed['no']) > 0:
            # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION
            details = [f"{host} does not have an active subscription"
                       for host in self.subscribed['no']]
            self.mgr.health_checks['CEPHADM_CHECK_SUBSCRIPTION'] = {
                'severity': 'warning',
                'summary': f"Support subscriptions inactive on {len(details)} host(s)"
                           f"({len(self.subscribed['yes'])} subscriptions active)",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_SUBSCRIPTION', None)

    def _check_public_network(self) -> None:
        """Warn when only some of the hosts have a NIC on a public network.

        No warning is raised when every host, or no host at all, is found on
        the public networks; any earlier warning is cleared in both cases.
        """
        hosts_remaining: List[str] = list(self.mgr.cache.facts.keys())
        hosts_removed: List[str] = []
        self.log.debug(f"checking public network membership for: {hosts_remaining}")

        for p_net in self.public_network_list:
            self.log.debug(f"checking network {p_net}")
            subnet_data = self.subnet_lookup.get(p_net, None)
            self.log.debug(f"subnet data - {subnet_data}")

            if not subnet_data:
                continue

            for host in subnet_data.host_list:
                if host in hosts_remaining:
                    hosts_remaining.remove(host)
                    hosts_removed.append(host)
                elif host not in hosts_removed:
                    self.log.debug(f"host={host}, subnet={p_net}")
                    # no exception is active here, so log a plain error
                    self.log.error(
                        "Host listed for a subnet but not present in the host facts?")

        # Ideally all hosts will have been removed since they have an IP on at least
        # one of the public networks
        if hosts_remaining and len(hosts_remaining) != len(self.mgr.cache.facts):
            # public network is visible on some hosts
            details = [
                f"{host} does not have an interface on any public network" for host in hosts_remaining]

            self.mgr.health_checks['CEPHADM_CHECK_PUBLIC_MEMBERSHIP'] = {
                'severity': 'warning',
                'summary': f"Public network(s) is not directly accessible from {len(hosts_remaining)} "
                           "cluster hosts",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_PUBLIC_MEMBERSHIP', None)

    def _check_osd_mtu(self) -> None:
        """Warn about hosts whose MTU on an OSD network differs from the majority.

        The OSD networks are the cluster networks when defined, otherwise the
        public networks. For each such subnet, the MTU used by the most hosts
        is taken as the reference and every host using another MTU is
        reported.
        """
        osd_hosts = set(self.hosts_with_role('osd'))
        osd_network_list = self.cluster_network_list or self.public_network_list
        mtu_errors: List[str] = []

        for osd_net in osd_network_list:
            subnet_data = self.subnet_lookup.get(osd_net, None)
            if not subnet_data:
                continue

            self.log.debug(f"processing mtu map : {json.dumps(subnet_data.mtu_map)}")
            # a problem exists when any MTU group is missing at least one OSD host
            if not any(osd_hosts.difference(hosts) for hosts in subnet_data.mtu_map.values()):
                continue

            mtu_ptr, _count = self._get_majority(subnet_data.mtu_map)
            self.log.debug("MTU problems detected")
            self.log.debug(f"most hosts using {mtu_ptr}")
            for bad_mtu, hosts in subnet_data.mtu_map.items():
                if bad_mtu == mtu_ptr:
                    continue
                for h in hosts:
                    host = HostFacts()
                    host.load_facts(self.mgr.cache.facts[h])
                    mtu_errors.append(
                        f"host {h}({host.subnet_to_nic(osd_net)}) is using MTU "
                        f"{bad_mtu} on {osd_net}, NICs on other hosts use {mtu_ptr}")

        if mtu_errors:
            self.mgr.health_checks['CEPHADM_CHECK_MTU'] = {
                'severity': 'warning',
                'summary': f"MTU setting inconsistent on osd network NICs on {len(mtu_errors)} host(s)",
                'count': len(mtu_errors),
                'detail': mtu_errors,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_MTU', None)

    def _check_osd_linkspeed(self) -> None:
        """Warn about hosts whose link speed on an OSD network is below the majority.

        The OSD networks are the cluster networks when defined, otherwise the
        public networks. For each such subnet, the speed used by the most
        hosts is the reference; hosts with a greater speed are not reported.
        """
        osd_hosts = set(self.hosts_with_role('osd'))
        osd_network_list = self.cluster_network_list or self.public_network_list

        linkspeed_errors = []

        for osd_net in osd_network_list:
            subnet_data = self.subnet_lookup.get(osd_net, None)
            if not subnet_data:
                continue

            self.log.debug(f"processing subnet : {subnet_data}")
            # a problem exists when any speed group is missing at least one OSD host
            if not any(osd_hosts.difference(hosts) for hosts in subnet_data.speed_map.values()):
                continue

            speed_ptr, _count = self._get_majority(subnet_data.speed_map)
            self.log.debug("linkspeed issue(s) detected")
            self.log.debug(f"most hosts using {speed_ptr}")
            for bad_speed, hosts in subnet_data.speed_map.items():
                if bad_speed == speed_ptr:
                    continue
                if bad_speed > speed_ptr:
                    # skip speed is better than most...it can stay!
                    continue
                for h in hosts:
                    host = HostFacts()
                    host.load_facts(self.mgr.cache.facts[h])
                    linkspeed_errors.append(
                        f"host {h}({host.subnet_to_nic(osd_net)}) has linkspeed of "
                        f"{bad_speed} on {osd_net}, NICs on other hosts use {speed_ptr}")

        if linkspeed_errors:
            self.mgr.health_checks['CEPHADM_CHECK_LINKSPEED'] = {
                'severity': 'warning',
                'summary': "Link speed is inconsistent on osd network NICs for "
                           f"{len(linkspeed_errors)} host(s)",
                'count': len(linkspeed_errors),
                'detail': linkspeed_errors,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_LINKSPEED', None)

    def _check_network_missing(self) -> None:
        """Warn about public/cluster networks that no host reported a NIC on."""
        all_networks = self.public_network_list.copy()
        all_networks.extend(self.cluster_network_list)

        missing_networks = []
        for subnet in all_networks:
            if not self.subnet_lookup.get(subnet, None):
                missing_networks.append(f"{subnet} not found on any host in the cluster")
                self.log.warning(
                    f"Network {subnet} has been defined, but is not present on any host")

        if missing_networks:
            net_sfx = 's' if len(missing_networks) > 1 else ''
            self.mgr.health_checks['CEPHADM_CHECK_NETWORK_MISSING'] = {
                'severity': 'warning',
                'summary': f"Public/cluster network{net_sfx} defined, but can not be found on "
                           "any host",
                'count': len(missing_networks),
                'detail': missing_networks,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_NETWORK_MISSING', None)

    def _check_release_parity(self) -> None:
        """Warn when services report more than one ``ceph_release`` value.

        The check is recorded as skipped while an upgrade is in progress.
        """
        upgrade_status = self.mgr.upgrade.upgrade_status()
        if upgrade_status.in_progress:
            # skip version consistency checks during an upgrade cycle
            self.skipped_checks.append('ceph_release')
            return

        services = self.get_ceph_metadata()
        self.log.debug(json.dumps(services))
        version_to_svcs: Dict[str, List[str]] = {}

        for svc, svc_metadata in services.items():
            if svc_metadata:
                metadata = cast(Dict[str, str], svc_metadata)
                v = metadata.get('ceph_release', '')
                version_to_svcs.setdefault(v, []).append(svc)

        if len(version_to_svcs) > 1:
            majority_ptr, _majority_count = self._get_majority(version_to_svcs)
            details = []
            for v, svcs in version_to_svcs.items():
                if v == majority_ptr:
                    continue
                for svc in svcs:
                    details.append(
                        f"{svc} is running {v} (majority of cluster is using {majority_ptr})")

            self.mgr.health_checks['CEPHADM_CHECK_CEPH_RELEASE'] = {
                'severity': 'warning',
                'summary': 'Ceph cluster running mixed ceph releases',
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
            self.log.warning(
                f"running with {len(version_to_svcs)} different ceph releases within this cluster")
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_CEPH_RELEASE', None)

    def _check_kernel_version(self) -> None:
        """Warn when hosts run kernels with differing MAJ.MIN versions."""
        if len(self.kernel_to_hosts) <= 1:
            self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_VERSION', None)
            return

        majority_hosts_ptr, majority_hosts_count = self._get_majority(self.kernel_to_hosts)
        details = []
        for k, hosts in self.kernel_to_hosts.items():
            if k == majority_hosts_ptr:
                continue
            for h in hosts:
                details.append(
                    f"host {h} running kernel {k}, majority of hosts({majority_hosts_count}) "
                    f"running {majority_hosts_ptr}")

        self.log.warning("mixed kernel versions detected")
        self.mgr.health_checks['CEPHADM_CHECK_KERNEL_VERSION'] = {
            'severity': 'warning',
            'summary': f"{len(details)} host(s) running different kernel versions",
            'count': len(details),
            'detail': details,
        }
        self.health_check_raised = True

    def _process_hosts(self) -> None:
        """Walk the cached host facts once and populate the lookup maps.

        Hosts whose facts fail validation are skipped with a warning.
        """
        self.log.debug(f"processing data from {len(self.mgr.cache.facts)} hosts")
        for hostname in self.mgr.cache.facts:
            host = HostFacts()
            host.load_facts(self.mgr.cache.facts[hostname])
            if not host._valid:
                self.log.warning(f"skipping {hostname} - incompatible host facts")
                continue

            # a fact may be present but null, so fall back to an empty mapping
            kernel_lsm = host.kernel_security or {}
            lsm_desc = kernel_lsm.get('description', '')
            if lsm_desc:
                self.lsm_to_host.setdefault(lsm_desc, []).append(hostname)

            subscription_state = host.subscribed.lower() if host.subscribed else None
            if subscription_state:
                if subscription_state not in self.subscribed:
                    # only yes/no/unknown are tracked (and cleared by reset())
                    subscription_state = 'unknown'
                self.subscribed[subscription_state].append(hostname)

            interfaces = host.interfaces or {}
            for name, nic in interfaces.items():
                if name in ['lo']:
                    continue
                self._update_subnet_lookups(hostname, name, nic)

            if host.kernel:
                kernel_maj_min = '.'.join(host.kernel.split('.')[0:2])
                self.kernel_to_hosts.setdefault(kernel_maj_min, []).append(hostname)
            else:
                self.log.warning(f"Host gather facts for {hostname} is missing kernel information")

            # NOTE: if daemondescription had systemd enabled state, we could check for systemd 'tampering'
            self.host_to_role[hostname] = list(self.mgr.cache.get_daemon_types(hostname))

    def run_checks(self) -> None:
        """Run every check that is not explicitly disabled and publish the results.

        Nothing happens unless the ``config_checks_enabled`` module option is
        True. When the stored ``config_checks`` setting cannot be decoded
        into a mapping, all checks run.
        """
        checks_enabled = self.mgr.get_module_option('config_checks_enabled')
        if checks_enabled is not True:
            return

        self.reset()

        check_config: Dict[str, str] = {}
        checks_raw: Optional[str] = self.mgr.get_store('config_checks')
        if checks_raw:
            try:
                # ValueError covers JSONDecodeError; update() itself raises
                # ValueError/TypeError when the JSON is not a mapping
                check_config.update(json.loads(checks_raw))
            except (ValueError, TypeError):
                self.log.exception(
                    "mgr/cephadm/config_checks is not JSON serializable - all checks will run")

        # build lookup "maps" by walking the host facts, once
        self._process_hosts()

        self.health_check_raised = False
        self.active_checks = []
        self.skipped_checks = []

        # process all healthchecks that are not explicitly disabled
        for health_check in self.health_checks:
            if check_config.get(health_check.name, '') != 'disabled':
                self.active_checks.append(health_check.name)
                health_check.func()

        self.mgr.set_health_checks(self.mgr.health_checks)