author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-23 16:45:17 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-23 16:45:44 +0000
commit	17d6a993fc17d533460c5f40f3908c708e057c18 (patch)
tree	1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /src/pybind/mgr/cephadm/module.py
parent	Releasing progress-linux version 18.2.2-0progress7.99u1. (diff)
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/pybind/mgr/cephadm/module.py')
-rw-r--r--	src/pybind/mgr/cephadm/module.py	189
1 file changed, 178 insertions(+), 11 deletions(-)
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 7b97ce74a..f1d234fb2 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1,4 +1,5 @@
import asyncio
+import concurrent
import json
import errno
import ipaddress
@@ -10,6 +11,7 @@ from configparser import ConfigParser
from contextlib import contextmanager
from functools import wraps
from tempfile import TemporaryDirectory, NamedTemporaryFile
+from urllib.error import HTTPError
from threading import Event
from cephadm.service_discovery import ServiceDiscovery
@@ -39,7 +41,14 @@ from cephadm.http_server import CephadmHttpServer
from cephadm.agent import CephadmAgentHelpers
-from mgr_module import MgrModule, HandleCommandResult, Option, NotifyType
+from mgr_module import (
+ MgrModule,
+ HandleCommandResult,
+ Option,
+ NotifyType,
+ MonCommandFailed,
+)
+from mgr_util import build_url
import orchestrator
from orchestrator.module import to_format, Format
@@ -64,9 +73,10 @@ from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError
from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \
NodeExporterService, SNMPGatewayService, LokiService, PromtailService
from .services.jaeger import ElasticSearchService, JaegerAgentService, JaegerCollectorService, JaegerQueryService
+from .services.node_proxy import NodeProxy
from .schedule import HostAssignment
from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \
- ClientKeyringStore, ClientKeyringSpec, TunedProfileStore
+ ClientKeyringStore, ClientKeyringSpec, TunedProfileStore, NodeProxyCache
from .upgrade import CephadmUpgrade
from .template import TemplateMgr
from .utils import CEPH_IMAGE_TYPES, RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES, forall_hosts, \
@@ -107,7 +117,7 @@ os._exit = os_exit_noop # type: ignore
DEFAULT_IMAGE = 'quay.io/ceph/ceph' # DO NOT ADD TAG TO THIS
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.43.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.5.0'
-DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:0.0.2'
+DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.0.0'
DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0'
DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.25.0'
@@ -436,6 +446,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down'
),
Option(
+ 'hw_monitoring',
+ type='bool',
+ default=False,
+ desc='Deploy hw monitoring daemon on every host.'
+ ),
+ Option(
'max_osd_draining_count',
type='int',
default=10,
@@ -467,11 +483,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
),
Option(
'default_cephadm_command_timeout',
- type='secs',
+ type='int',
default=15 * 60,
desc='Default timeout applied to cephadm commands run directly on '
'the host (in seconds)'
),
+ Option(
+ 'oob_default_addr',
+ type='str',
+ default='169.254.1.1',
+ desc="Default address for RedFish API (oob management)."
+ ),
]
def __init__(self, *args: Any, **kwargs: Any):
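Note (not part of this commit): as with every other option declared in MODULE_OPTIONS, the new hw_monitoring and oob_default_addr settings can be changed at runtime, e.g. `ceph config set mgr mgr/cephadm/hw_monitoring true` or `ceph config set mgr mgr/cephadm/oob_default_addr <ip>`; config_notify() then copies the values onto the module attributes initialised in __init__ below.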
@@ -529,6 +551,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.registry_password: Optional[str] = None
self.registry_insecure: bool = False
self.use_repo_digest = True
+ self.config_checks_enabled = False
self.default_registry = ''
self.autotune_memory_target_ratio = 0.0
self.autotune_interval = 0
@@ -545,6 +568,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.agent_refresh_rate = 0
self.agent_down_multiplier = 0.0
self.agent_starting_port = 0
+ self.hw_monitoring = False
self.service_discovery_port = 0
self.secure_monitoring_stack = False
self.apply_spec_fails: List[Tuple[str, str]] = []
@@ -554,6 +578,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.cgroups_split = True
self.log_refresh_metadata = False
self.default_cephadm_command_timeout = 0
+ self.oob_default_addr = ''
self.notify(NotifyType.mon_map, None)
self.config_notify()
@@ -584,6 +609,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.cache = HostCache(self)
self.cache.load()
+ self.node_proxy_cache = NodeProxyCache(self)
+ self.node_proxy_cache.load()
+
self.agent_cache = AgentCache(self)
self.agent_cache.load()
@@ -621,7 +649,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
PrometheusService, NodeExporterService, LokiService, PromtailService, CrashService, IscsiService,
IngressService, CustomContainerService, CephfsMirrorService, NvmeofService,
CephadmAgent, CephExporterService, SNMPGatewayService, ElasticSearchService,
- JaegerQueryService, JaegerAgentService, JaegerCollectorService
+ JaegerQueryService, JaegerAgentService, JaegerCollectorService, NodeProxy
]
# https://github.com/python/mypy/issues/8993
@@ -632,6 +660,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.osd_service: OSDService = cast(OSDService, self.cephadm_services['osd'])
self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi'])
self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof'])
+ self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy'])
self.scheduled_async_actions: List[Callable] = []
@@ -644,6 +673,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.http_server = CephadmHttpServer(self)
self.http_server.start()
+
+ self.node_proxy = NodeProxy(self)
+
self.agent_helpers = CephadmAgentHelpers(self)
if self.use_agent:
self.agent_helpers._apply_agent()
@@ -709,7 +741,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
# are provided, that will be included in the OrchestratorError's message
try:
yield
- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, concurrent.futures.TimeoutError):
err_str: str = ''
if cmd:
err_str = f'Command "{cmd}" timed out '
@@ -722,6 +754,16 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
else:
err_str += (f'(default {self.default_cephadm_command_timeout} second timeout)')
raise OrchestratorError(err_str)
+ except concurrent.futures.CancelledError as e:
+ err_str = ''
+ if cmd:
+ err_str = f'Command "{cmd}" failed '
+ else:
+ err_str = 'Command failed '
+ if host:
+ err_str += f'on host {host} '
+ err_str += f' - {str(e)}'
+ raise OrchestratorError(err_str)
def set_container_image(self, entity: str, image: str) -> None:
self.check_mon_command({
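A minimal sketch (illustrative, not from this commit) of why concurrent.futures.TimeoutError needs to be caught alongside asyncio.TimeoutError: when a coroutine is driven from a plain thread via run_coroutine_threadsafe — the assumption here about how cephadm waits on its async SSH calls — the timeout surfaces as the concurrent.futures variant:

    import asyncio
    import concurrent.futures
    import threading

    # background event loop thread, standing in for cephadm's SSH event loop
    loop = asyncio.new_event_loop()
    threading.Thread(target=loop.run_forever, daemon=True).start()

    async def slow_cmd() -> None:
        await asyncio.sleep(10)

    fut = asyncio.run_coroutine_threadsafe(slow_cmd(), loop)
    try:
        fut.result(timeout=1)  # raises concurrent.futures.TimeoutError, not asyncio.TimeoutError
    except concurrent.futures.TimeoutError:
        print('command timed out')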
@@ -810,7 +852,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
Generate a unique random service name
"""
suffix = daemon_type not in [
- 'mon', 'crash', 'ceph-exporter',
+ 'mon', 'crash', 'ceph-exporter', 'node-proxy',
'prometheus', 'node-exporter', 'grafana', 'alertmanager',
'container', 'agent', 'snmp-gateway', 'loki', 'promtail',
'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query'
@@ -948,6 +990,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len(
failed_daemons), failed_daemons)
+ def get_first_matching_network_ip(self, host: str, sspec: ServiceSpec) -> Optional[str]:
+ sspec_networks = sspec.networks
+ for subnet, ifaces in self.cache.networks.get(host, {}).items():
+ host_network = ipaddress.ip_network(subnet)
+ for spec_network_str in sspec_networks:
+ spec_network = ipaddress.ip_network(spec_network_str)
+ if host_network.overlaps(spec_network):
+ return list(ifaces.values())[0][0]
+ logger.error(f'{spec_network} from {sspec.service_name()} spec does not overlap with {host_network} on {host}')
+ return None
+
@staticmethod
def can_run() -> Tuple[bool, str]:
if asyncssh is not None:
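A small illustration (not part of the diff, addresses invented) of the subnet overlap test that get_first_matching_network_ip() relies on:

    import ipaddress

    host_network = ipaddress.ip_network('10.1.0.0/24')   # subnet reported for the host
    spec_network = ipaddress.ip_network('10.1.0.0/16')   # network listed in the service spec
    if host_network.overlaps(spec_network):
        print('match: the first IP on that host interface would be returned')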
@@ -1320,7 +1373,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
@orchestrator._cli_read_command('cephadm config-check status')
def _config_check_status(self) -> HandleCommandResult:
"""Show whether the configuration checker feature is enabled/disabled"""
- status = self.get_module_option('config_checks_enabled')
+ status = self.config_checks_enabled
return HandleCommandResult(stdout="Enabled" if status else "Disabled")
@orchestrator._cli_write_command('cephadm config-check enable')
@@ -1597,6 +1650,18 @@ Then run the following:
if spec.hostname in self.inventory and self.inventory.get_addr(spec.hostname) != spec.addr:
self.cache.refresh_all_host_info(spec.hostname)
+ if spec.oob:
+ if not spec.oob.get('addr'):
+ spec.oob['addr'] = self.oob_default_addr
+ if not spec.oob.get('port'):
+ spec.oob['port'] = '443'
+ host_oob_info = dict()
+ host_oob_info['addr'] = spec.oob['addr']
+ host_oob_info['port'] = spec.oob['port']
+ host_oob_info['username'] = spec.oob['username']
+ host_oob_info['password'] = spec.oob['password']
+ self.node_proxy_cache.update_oob(spec.hostname, host_oob_info)
+
# prime crush map?
if spec.location:
self.check_mon_command({
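For illustration only (values invented, behaviour taken from the code above): a host spec whose oob section carries only credentials ends up cached with the defaults filled in:

    # hypothetical input on the HostSpec: spec.oob = {'username': 'root', 'password': 'secret'}
    # what node_proxy_cache.update_oob() then receives:
    host_oob_info = {
        'addr': '169.254.1.1',   # oob_default_addr
        'port': '443',
        'username': 'root',
        'password': 'secret',
    }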
@@ -1621,7 +1686,72 @@ Then run the following:
return self._add_host(spec)
@handle_orch_error
- def remove_host(self, host: str, force: bool = False, offline: bool = False) -> str:
+ def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]:
+ try:
+ result = self.node_proxy.led(light_type=light_type,
+ action=action,
+ hostname=hostname,
+ device=device)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"http error while querying node-proxy API: {e}")
+ return result
+
+ @handle_orch_error
+ def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> str:
+ if not yes_i_really_mean_it:
+ raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+ try:
+ self.node_proxy.shutdown(hostname, force)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"Can't shutdown node {hostname}: {e}")
+ return f'Shutdown scheduled on {hostname}'
+
+ @handle_orch_error
+ def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> str:
+ if not yes_i_really_mean_it:
+ raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+ try:
+ self.node_proxy.powercycle(hostname)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"Can't perform powercycle on node {hostname}: {e}")
+ return f'Powercycle scheduled on {hostname}'
+
+ @handle_orch_error
+ def node_proxy_fullreport(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.fullreport(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_summary(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.summary(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_firmwares(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.firmwares(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_criticals(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.criticals(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_common(self, category: str, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.common(category, hostname=hostname)
+
+ @handle_orch_error
+ def remove_host(self, host: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> str:
"""
Remove a host from orchestrator management.
@@ -1701,6 +1831,17 @@ Then run the following:
}
run_cmd(cmd_args)
+ if rm_crush_entry:
+ try:
+ self.check_mon_command({
+ 'prefix': 'osd crush remove',
+ 'name': host,
+ })
+ except MonCommandFailed as e:
+ self.log.error(f'Couldn\'t remove host {host} from CRUSH map: {str(e)}')
+ return (f'Cephadm failed removing host {host}\n'
+ f'Failed to remove host {host} from the CRUSH map: {str(e)}')
+
self.inventory.rm_host(host)
self.cache.rm_host(host)
self.ssh.reset_con(host)
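Note (not from the diff): the mon command issued here is the same operation exposed on the CLI as `ceph osd crush remove <host>`; if it fails, remove_host() returns the error before touching the inventory, so the host stays under cephadm management.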
@@ -2665,6 +2806,15 @@ Then run the following:
pass
deps = sorted([self.get_mgr_ip(), server_port, root_cert,
str(self.device_enhanced_scan)])
+ elif daemon_type == 'node-proxy':
+ root_cert = ''
+ server_port = ''
+ try:
+ server_port = str(self.http_server.agent.server_port)
+ root_cert = self.http_server.agent.ssl_certs.get_root_cert()
+ except Exception:
+ pass
+ deps = sorted([self.get_mgr_ip(), server_port, root_cert])
elif daemon_type == 'iscsi':
if spec:
iscsi_spec = cast(IscsiServiceSpec, spec)
@@ -2704,6 +2854,12 @@ Then run the following:
deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
elif daemon_type == 'promtail':
deps += get_daemon_names(['loki'])
+ elif daemon_type == JaegerAgentService.TYPE:
+ for dd in self.cache.get_daemons_by_type(JaegerCollectorService.TYPE):
+ assert dd.hostname is not None
+ port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT
+ deps.append(build_url(host=dd.hostname, port=port).lstrip('/'))
+ deps = sorted(deps)
else:
# TODO(redo): some error message!
pass
@@ -3347,8 +3503,7 @@ Then run the following:
"""
for osd_id in osd_ids:
try:
- self.to_remove_osds.rm(OSD(osd_id=int(osd_id),
- remove_util=self.to_remove_osds.rm_util))
+ self.to_remove_osds.rm_by_osd_id(int(osd_id))
except (NotFoundError, KeyError, ValueError):
return f'Unable to find OSD in the queue: {osd_id}'
@@ -3382,6 +3537,18 @@ Then run the following:
f"It is recommended to add the {SpecialHostLabels.ADMIN} label to another host"
" before completing this operation.\nIf you're certain this is"
" what you want rerun this command with --force.")
+ # if the user has specified the host we are going to drain
+ # explicitly in any service spec, warn the user. Having a
+ # drained host listed in a placement provides no value, so
+ # they may want to fix it.
+ services_matching_drain_host: List[str] = []
+ for sname, sspec in self.spec_store.all_specs.items():
+ if sspec.placement.hosts and hostname in [h.hostname for h in sspec.placement.hosts]:
+ services_matching_drain_host.append(sname)
+ if services_matching_drain_host:
+ raise OrchestratorValidationError(f'Host {hostname} was found explicitly listed in the placements '
+ f'of services:\n {services_matching_drain_host}.\nPlease update those '
+ 'specs to not list this host.\nThis warning can be bypassed with --force')
self.add_host_label(hostname, '_no_schedule')
if not keep_conf_keyring: