From 17d6a993fc17d533460c5f40f3908c708e057c18 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Thu, 23 May 2024 18:45:17 +0200
Subject: Merging upstream version 18.2.3.

Signed-off-by: Daniel Baumann
---
 src/pybind/mgr/cephadm/module.py | 189 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 178 insertions(+), 11 deletions(-)

(limited to 'src/pybind/mgr/cephadm/module.py')

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 7b97ce74a..f1d234fb2 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1,4 +1,5 @@
 import asyncio
+import concurrent
 import json
 import errno
 import ipaddress
@@ -10,6 +11,7 @@ from configparser import ConfigParser
 from contextlib import contextmanager
 from functools import wraps
 from tempfile import TemporaryDirectory, NamedTemporaryFile
+from urllib.error import HTTPError
 from threading import Event
 
 from cephadm.service_discovery import ServiceDiscovery
@@ -39,7 +41,14 @@ from cephadm.http_server import CephadmHttpServer
 from cephadm.agent import CephadmAgentHelpers
 
 
-from mgr_module import MgrModule, HandleCommandResult, Option, NotifyType
+from mgr_module import (
+    MgrModule,
+    HandleCommandResult,
+    Option,
+    NotifyType,
+    MonCommandFailed,
+)
+from mgr_util import build_url
 
 import orchestrator
 from orchestrator.module import to_format, Format
@@ -64,9 +73,10 @@ from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError
 from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \
     NodeExporterService, SNMPGatewayService, LokiService, PromtailService
 from .services.jaeger import ElasticSearchService, JaegerAgentService, JaegerCollectorService, JaegerQueryService
+from .services.node_proxy import NodeProxy
 from .schedule import HostAssignment
 from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \
-    ClientKeyringStore, ClientKeyringSpec, TunedProfileStore
+    ClientKeyringStore, ClientKeyringSpec, TunedProfileStore, NodeProxyCache
 from .upgrade import CephadmUpgrade
 from .template import TemplateMgr
 from .utils import CEPH_IMAGE_TYPES, RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES, forall_hosts, \
@@ -107,7 +117,7 @@ os._exit = os_exit_noop  # type: ignore
 DEFAULT_IMAGE = 'quay.io/ceph/ceph'  # DO NOT ADD TAG TO THIS
 DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.43.0'
 DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.5.0'
-DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:0.0.2'
+DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.0.0'
 DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0'
 DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0'
 DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.25.0'
@@ -435,6 +445,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             default=3.0,
             desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down'
         ),
+        Option(
+            'hw_monitoring',
+            type='bool',
+            default=False,
+            desc='Deploy hw monitoring daemon on every host.'
+        ),
         Option(
             'max_osd_draining_count',
             type='int',
@@ -467,11 +483,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         ),
         Option(
             'default_cephadm_command_timeout',
-            type='secs',
+            type='int',
             default=15 * 60,
             desc='Default timeout applied to cephadm commands run directly on '
                  'the host (in seconds)'
         ),
+        Option(
+            'oob_default_addr',
+            type='str',
+            default='169.254.1.1',
+            desc="Default address for RedFish API (oob management)."
+        ),
     ]
 
     def __init__(self, *args: Any, **kwargs: Any):
@@ -529,6 +551,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.registry_password: Optional[str] = None
             self.registry_insecure: bool = False
             self.use_repo_digest = True
+            self.config_checks_enabled = False
             self.default_registry = ''
             self.autotune_memory_target_ratio = 0.0
             self.autotune_interval = 0
@@ -545,6 +568,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.agent_refresh_rate = 0
             self.agent_down_multiplier = 0.0
             self.agent_starting_port = 0
+            self.hw_monitoring = False
             self.service_discovery_port = 0
             self.secure_monitoring_stack = False
             self.apply_spec_fails: List[Tuple[str, str]] = []
@@ -554,6 +578,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.cgroups_split = True
             self.log_refresh_metadata = False
             self.default_cephadm_command_timeout = 0
+            self.oob_default_addr = ''
 
         self.notify(NotifyType.mon_map, None)
         self.config_notify()
@@ -584,6 +609,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         self.cache = HostCache(self)
         self.cache.load()
 
+        self.node_proxy_cache = NodeProxyCache(self)
+        self.node_proxy_cache.load()
+
         self.agent_cache = AgentCache(self)
         self.agent_cache.load()
 
@@ -621,7 +649,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             PrometheusService, NodeExporterService, LokiService, PromtailService, CrashService, IscsiService,
             IngressService, CustomContainerService, CephfsMirrorService, NvmeofService,
             CephadmAgent, CephExporterService, SNMPGatewayService, ElasticSearchService,
-            JaegerQueryService, JaegerAgentService, JaegerCollectorService
+            JaegerQueryService, JaegerAgentService, JaegerCollectorService, NodeProxy
         ]
 
         # https://github.com/python/mypy/issues/8993
@@ -632,6 +660,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         self.osd_service: OSDService = cast(OSDService, self.cephadm_services['osd'])
         self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi'])
         self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof'])
+        self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy'])
 
         self.scheduled_async_actions: List[Callable] = []
 
@@ -644,6 +673,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
 
         self.http_server = CephadmHttpServer(self)
         self.http_server.start()
+
+        self.node_proxy = NodeProxy(self)
+
         self.agent_helpers = CephadmAgentHelpers(self)
         if self.use_agent:
             self.agent_helpers._apply_agent()
@@ -709,7 +741,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         # are provided, that will be included in the OrchestratorError's message
         try:
             yield
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, concurrent.futures.TimeoutError):
             err_str: str = ''
             if cmd:
                 err_str = f'Command "{cmd}" timed out '
@@ -722,6 +754,16 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             else:
                 err_str += (f'(default {self.default_cephadm_command_timeout} second timeout)')
             raise OrchestratorError(err_str)
+        except concurrent.futures.CancelledError as e:
+            err_str = ''
+            if cmd:
+                err_str = f'Command "{cmd}" failed '
+            else:
+                err_str = 'Command failed '
+            if host:
+                err_str += f'on host {host} '
+            err_str += f' - {str(e)}'
+            raise OrchestratorError(err_str)
 
     def set_container_image(self, entity: str, image: str) -> None:
         self.check_mon_command({
@@ -810,7 +852,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         Generate a unique random service name
         """
         suffix = daemon_type not in [
-            'mon', 'crash', 'ceph-exporter',
+            'mon', 'crash', 'ceph-exporter', 'node-proxy',
             'prometheus', 'node-exporter', 'grafana', 'alertmanager',
             'container', 'agent', 'snmp-gateway', 'loki', 'promtail',
             'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query'
@@ -948,6 +990,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len(
                 failed_daemons), failed_daemons)
 
+    def get_first_matching_network_ip(self, host: str, sspec: ServiceSpec) -> Optional[str]:
+        sspec_networks = sspec.networks
+        for subnet, ifaces in self.cache.networks.get(host, {}).items():
+            host_network = ipaddress.ip_network(subnet)
+            for spec_network_str in sspec_networks:
+                spec_network = ipaddress.ip_network(spec_network_str)
+                if host_network.overlaps(spec_network):
+                    return list(ifaces.values())[0][0]
+                logger.error(f'{spec_network} from {sspec.service_name()} spec does not overlap with {host_network} on {host}')
+        return None
+
     @staticmethod
     def can_run() -> Tuple[bool, str]:
         if asyncssh is not None:
@@ -1320,7 +1373,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
     @orchestrator._cli_read_command('cephadm config-check status')
     def _config_check_status(self) -> HandleCommandResult:
         """Show whether the configuration checker feature is enabled/disabled"""
-        status = self.get_module_option('config_checks_enabled')
+        status = self.config_checks_enabled
         return HandleCommandResult(stdout="Enabled" if status else "Disabled")
 
     @orchestrator._cli_write_command('cephadm config-check enable')
@@ -1597,6 +1650,18 @@ Then run the following:
         if spec.hostname in self.inventory and self.inventory.get_addr(spec.hostname) != spec.addr:
             self.cache.refresh_all_host_info(spec.hostname)
 
+        if spec.oob:
+            if not spec.oob.get('addr'):
+                spec.oob['addr'] = self.oob_default_addr
+            if not spec.oob.get('port'):
+                spec.oob['port'] = '443'
+            host_oob_info = dict()
+            host_oob_info['addr'] = spec.oob['addr']
+            host_oob_info['port'] = spec.oob['port']
+            host_oob_info['username'] = spec.oob['username']
+            host_oob_info['password'] = spec.oob['password']
+            self.node_proxy_cache.update_oob(spec.hostname, host_oob_info)
+
         # prime crush map?
         if spec.location:
             self.check_mon_command({
@@ -1621,7 +1686,72 @@ Then run the following:
         return self._add_host(spec)
 
     @handle_orch_error
-    def remove_host(self, host: str, force: bool = False, offline: bool = False) -> str:
+    def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]:
+        try:
+            result = self.node_proxy.led(light_type=light_type,
+                                         action=action,
+                                         hostname=hostname,
+                                         device=device)
+        except RuntimeError as e:
+            self.log.error(e)
+            raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+        except HTTPError as e:
+            self.log.error(e)
+            raise OrchestratorValidationError(f"http error while querying node-proxy API: {e}")
+        return result
+
+    @handle_orch_error
+    def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> str:
+        if not yes_i_really_mean_it:
+            raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+        try:
+            self.node_proxy.shutdown(hostname, force)
+        except RuntimeError as e:
+            self.log.error(e)
+            raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+        except HTTPError as e:
+            self.log.error(e)
+            raise OrchestratorValidationError(f"Can't shutdown node {hostname}: {e}")
+        return f'Shutdown scheduled on {hostname}'
+
+    @handle_orch_error
+    def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> str:
+        if not yes_i_really_mean_it:
+            raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+        try:
+            self.node_proxy.powercycle(hostname)
+        except RuntimeError as e:
+            self.log.error(e)
+            raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+        except HTTPError as e:
+            self.log.error(e)
+            raise OrchestratorValidationError(f"Can't perform powercycle on node {hostname}: {e}")
+        return f'Powercycle scheduled on {hostname}'
+
+    @handle_orch_error
+    def node_proxy_fullreport(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+        return self.node_proxy_cache.fullreport(hostname=hostname)
+
+    @handle_orch_error
+    def node_proxy_summary(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+        return self.node_proxy_cache.summary(hostname=hostname)
+
+    @handle_orch_error
+    def node_proxy_firmwares(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+        return self.node_proxy_cache.firmwares(hostname=hostname)
+
+    @handle_orch_error
+    def node_proxy_criticals(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+        return self.node_proxy_cache.criticals(hostname=hostname)
+
+    @handle_orch_error
+    def node_proxy_common(self, category: str, hostname: Optional[str] = None) -> Dict[str, Any]:
+        return self.node_proxy_cache.common(category, hostname=hostname)
+
+    @handle_orch_error
+    def remove_host(self, host: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> str:
         """
         Remove a host from orchestrator management.
 
@@ -1701,6 +1831,17 @@ Then run the following:
             }
             run_cmd(cmd_args)
 
+        if rm_crush_entry:
+            try:
+                self.check_mon_command({
+                    'prefix': 'osd crush remove',
+                    'name': host,
+                })
+            except MonCommandFailed as e:
+                self.log.error(f'Couldn\'t remove host {host} from CRUSH map: {str(e)}')
+                return (f'Cephadm failed removing host {host}\n'
+                        f'Failed to remove host {host} from the CRUSH map: {str(e)}')
+
         self.inventory.rm_host(host)
         self.cache.rm_host(host)
         self.ssh.reset_con(host)
@@ -2665,6 +2806,15 @@ Then run the following:
                 pass
             deps = sorted([self.get_mgr_ip(), server_port, root_cert,
                            str(self.device_enhanced_scan)])
+        elif daemon_type == 'node-proxy':
+            root_cert = ''
+            server_port = ''
+            try:
+                server_port = str(self.http_server.agent.server_port)
+                root_cert = self.http_server.agent.ssl_certs.get_root_cert()
+            except Exception:
+                pass
+            deps = sorted([self.get_mgr_ip(), server_port, root_cert])
         elif daemon_type == 'iscsi':
             if spec:
                 iscsi_spec = cast(IscsiServiceSpec, spec)
@@ -2704,6 +2854,12 @@ Then run the following:
                 deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
         elif daemon_type == 'promtail':
             deps += get_daemon_names(['loki'])
+        elif daemon_type == JaegerAgentService.TYPE:
+            for dd in self.cache.get_daemons_by_type(JaegerCollectorService.TYPE):
+                assert dd.hostname is not None
+                port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT
+                deps.append(build_url(host=dd.hostname, port=port).lstrip('/'))
+            deps = sorted(deps)
         else:
             # TODO(redo): some error message!
             pass
@@ -3347,8 +3503,7 @@ Then run the following:
         """
         for osd_id in osd_ids:
             try:
-                self.to_remove_osds.rm(OSD(osd_id=int(osd_id),
-                                           remove_util=self.to_remove_osds.rm_util))
+                self.to_remove_osds.rm_by_osd_id(int(osd_id))
             except (NotFoundError, KeyError, ValueError):
                 return f'Unable to find OSD in the queue: {osd_id}'
 
@@ -3382,6 +3537,18 @@ Then run the following:
                 f"It is recommended to add the {SpecialHostLabels.ADMIN} label to another host"
                 " before completing this operation.\nIf you're certain this is"
                 " what you want rerun this command with --force.")
+        # if the user has specified the host we are going to drain
+        # explicitly in any service spec, warn the user. Having a
+        # drained host listed in a placement provides no value, so
+        # they may want to fix it.
+        services_matching_drain_host: List[str] = []
+        for sname, sspec in self.spec_store.all_specs.items():
+            if sspec.placement.hosts and hostname in [h.hostname for h in sspec.placement.hosts]:
+                services_matching_drain_host.append(sname)
+        if services_matching_drain_host:
+            raise OrchestratorValidationError(f'Host {hostname} was found explicitly listed in the placements '
+                                              f'of services:\n {services_matching_drain_host}.\nPlease update those '
+                                              'specs to not list this host.\nThis warning can be bypassed with --force')
 
         self.add_host_label(hostname, '_no_schedule')
         if not keep_conf_keyring:
--
cgit v1.2.3