author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-23 16:45:17 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-23 16:45:44 +0000
commit     17d6a993fc17d533460c5f40f3908c708e057c18 (patch)
tree       1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /src/pybind/mgr/cephadm/services
parent     Releasing progress-linux version 18.2.2-0progress7.99u1. (diff)
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/pybind/mgr/cephadm/services')
 src/pybind/mgr/cephadm/services/cephadmservice.py |   2
 src/pybind/mgr/cephadm/services/ingress.py        |   1
 src/pybind/mgr/cephadm/services/jaeger.py         |   3
 src/pybind/mgr/cephadm/services/monitoring.py     |  30
 src/pybind/mgr/cephadm/services/nfs.py            |  20
 src/pybind/mgr/cephadm/services/node_proxy.py     | 180
 src/pybind/mgr/cephadm/services/nvmeof.py         |   1
 src/pybind/mgr/cephadm/services/osd.py            |  22
 8 files changed, 251 insertions(+), 8 deletions(-)
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index 7d7a04dad..f3c45b164 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -41,7 +41,7 @@ def get_auth_entity(daemon_type: str, daemon_id: str, host: str = "") -> AuthEntity:
     # the CephService class refers to service types, not daemon types
     if daemon_type in ['rgw', 'rbd-mirror', 'cephfs-mirror', 'nfs', "iscsi", 'nvmeof', 'ingress', 'ceph-exporter']:
         return AuthEntity(f'client.{daemon_type}.{daemon_id}')
-    elif daemon_type in ['crash', 'agent']:
+    elif daemon_type in ['crash', 'agent', 'node-proxy']:
         if host == "":
             raise OrchestratorError(
                 f'Host not provided to generate <{daemon_type}> auth entity name')
diff --git a/src/pybind/mgr/cephadm/services/ingress.py b/src/pybind/mgr/cephadm/services/ingress.py
index 55be30454..5edd2517d 100644
--- a/src/pybind/mgr/cephadm/services/ingress.py
+++ b/src/pybind/mgr/cephadm/services/ingress.py
@@ -187,6 +187,7 @@ class IngressService(CephService):
                 'monitor_port': daemon_spec.ports[1] if daemon_spec.ports else spec.monitor_port,
                 'local_host_ip': host_ip,
                 'default_server_opts': server_opts,
+                'health_check_interval': spec.health_check_interval or '2s',
             }
         )
         config_files = {
diff --git a/src/pybind/mgr/cephadm/services/jaeger.py b/src/pybind/mgr/cephadm/services/jaeger.py
index c136d20e6..c83c765d0 100644
--- a/src/pybind/mgr/cephadm/services/jaeger.py
+++ b/src/pybind/mgr/cephadm/services/jaeger.py
@@ -20,13 +20,16 @@ class JaegerAgentService(CephadmService):
 
     def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
         assert self.TYPE == daemon_spec.daemon_type
         collectors = []
+        deps: List[str] = []
         for dd in self.mgr.cache.get_daemons_by_type(JaegerCollectorService.TYPE):
             # scrape jaeger-collector nodes
             assert dd.hostname is not None
             port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT
             url = build_url(host=dd.hostname, port=port).lstrip('/')
             collectors.append(url)
+            deps.append(url)
         daemon_spec.final_config = {'collector_nodes': ",".join(collectors)}
+        daemon_spec.deps = sorted(deps)
         return daemon_spec
 
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index 114c84860..10ddcbbd0 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -67,13 +67,24 @@ class GrafanaService(CephadmService):
         spec: GrafanaSpec = cast(
             GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name])
+
+        grafana_port = daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT
+        grafana_ip = daemon_spec.ip if daemon_spec.ip else ''
+
+        if spec.only_bind_port_on_networks and spec.networks:
+            assert daemon_spec.host is not None
+            ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec)
+            if ip_to_bind_to:
+                daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to}
+                grafana_ip = ip_to_bind_to
+
         grafana_ini = self.mgr.template.render(
             'services/grafana/grafana.ini.j2', {
                 'anonymous_access': spec.anonymous_access,
                 'initial_admin_password': spec.initial_admin_password,
-                'http_port': daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT,
+                'http_port': grafana_port,
                 'protocol': spec.protocol,
-                'http_addr': daemon_spec.ip if daemon_spec.ip else ''
+                'http_addr': grafana_ip
             })
 
         if 'dashboard' in self.mgr.get('mgr_map')['modules'] and spec.initial_admin_password:
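Both the Grafana hunk above and the Prometheus hunk that follows honor only_bind_port_on_networks via mgr.get_first_matching_network_ip(), whose implementation is not part of this diff. The following standalone sketch (hypothetical helper name and illustrative addresses) shows the selection logic such a helper is expected to perform:

import ipaddress
from typing import List, Optional

def first_matching_network_ip(host_ips: List[str], networks: List[str]) -> Optional[str]:
    # Return the first host address contained in one of the spec's
    # networks; None means "bind to all interfaces" as before.
    for network in networks:
        net = ipaddress.ip_network(network)
        for ip in host_ips:
            if ipaddress.ip_address(ip) in net:
                return ip
    return None

# A host with addresses on two subnets, spec restricted to one of them:
ips = ['10.0.0.15', '192.168.1.15']
assert first_matching_network_ip(ips, ['192.168.1.0/24']) == '192.168.1.15'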
@@ -402,6 +413,7 @@ class PrometheusService(CephadmService):
         haproxy_sd_url = f'{srv_end_point}service=haproxy' if haproxy_cnt > 0 else None
         mgr_prometheus_sd_url = f'{srv_end_point}service=mgr-prometheus'  # always included
         ceph_exporter_sd_url = f'{srv_end_point}service=ceph-exporter'  # always included
+        nvmeof_sd_url = f'{srv_end_point}service=nvmeof'  # always included
 
         alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
         prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
@@ -417,9 +429,17 @@ class PrometheusService(CephadmService):
             'node_exporter_sd_url': node_exporter_sd_url,
             'alertmanager_sd_url': alertmanager_sd_url,
             'haproxy_sd_url': haproxy_sd_url,
-            'ceph_exporter_sd_url': ceph_exporter_sd_url
+            'ceph_exporter_sd_url': ceph_exporter_sd_url,
+            'nvmeof_sd_url': nvmeof_sd_url,
         }
 
+        ip_to_bind_to = ''
+        if spec.only_bind_port_on_networks and spec.networks:
+            assert daemon_spec.host is not None
+            ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) or ''
+            if ip_to_bind_to:
+                daemon_spec.port_ips = {str(port): ip_to_bind_to}
+
         web_context = {
             'prometheus_web_user': prometheus_user,
             'prometheus_web_password': password_hash(prometheus_password),
@@ -446,6 +466,7 @@ class PrometheusService(CephadmService):
                 },
                 'retention_time': retention_time,
                 'retention_size': retention_size,
+                'ip_to_bind_to': ip_to_bind_to,
                 'web_config': '/etc/prometheus/web.yml'
             }
         else:
@@ -454,7 +475,8 @@ class PrometheusService(CephadmService):
                     'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context)
                 },
                 'retention_time': retention_time,
-                'retention_size': retention_size
+                'retention_size': retention_size,
+                'ip_to_bind_to': ip_to_bind_to
             }
 
         # include alerts, if present in the container
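For context, the new nvmeof_sd_url follows the same pattern as the existing service-discovery URLs, and ip_to_bind_to is forwarded to the prometheus template context. A minimal sketch, assuming a hypothetical mgr discovery endpoint and illustrative addresses:

# srv_end_point and the addresses are illustrative; in cephadm the root
# URL comes from the mgr's service-discovery server.
srv_end_point = 'https://203.0.113.10:8765/sd/prometheus/sd-config?'
nvmeof_sd_url = f'{srv_end_point}service=nvmeof'  # scrape nvmeof gateways

port = 9095
ip_to_bind_to = '192.168.1.15'  # '' would leave Prometheus bound to all interfaces
port_ips = {str(port): ip_to_bind_to} if ip_to_bind_to else {}
print(nvmeof_sd_url)
print(port_ips)  # {'9095': '192.168.1.15'}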
diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py
index f94a00f5b..e0c61b117 100644
--- a/src/pybind/mgr/cephadm/services/nfs.py
+++ b/src/pybind/mgr/cephadm/services/nfs.py
@@ -5,6 +5,8 @@ import os
 import subprocess
 import tempfile
 from typing import Dict, Tuple, Any, List, cast, Optional
+from configparser import ConfigParser
+from io import StringIO
 
 from mgr_module import HandleCommandResult
 from mgr_module import NFS_POOL_NAME as POOL_NAME
@@ -79,6 +81,8 @@ class NFSService(CephService):
 
         nodeid = f'{daemon_spec.service_name}.{daemon_spec.rank}'
 
+        nfs_idmap_conf = '/etc/ganesha/idmap.conf'
+
         # create the RADOS recovery pool keyring
         rados_user = f'{daemon_type}.{daemon_id}'
         rados_keyring = self.create_keyring(daemon_spec)
@@ -115,12 +119,27 @@ class NFSService(CephService):
                 "port": daemon_spec.ports[0] if daemon_spec.ports else 2049,
                 "bind_addr": bind_addr,
                 "haproxy_hosts": [],
+                "nfs_idmap_conf": nfs_idmap_conf,
             }
             if spec.enable_haproxy_protocol:
                 context["haproxy_hosts"] = self._haproxy_hosts()
                 logger.debug("selected haproxy_hosts: %r", context["haproxy_hosts"])
             return self.mgr.template.render('services/nfs/ganesha.conf.j2', context)
 
+        # generate the idmap config
+        def get_idmap_conf() -> str:
+            idmap_conf = spec.idmap_conf
+            output = ''
+            if idmap_conf is not None:
+                cp = ConfigParser()
+                out = StringIO()
+                cp.read_dict(idmap_conf)
+                cp.write(out)
+                out.seek(0)
+                output = out.read()
+                out.close()
+            return output
+
         # generate the cephadm config json
         def get_cephadm_config() -> Dict[str, Any]:
             config: Dict[str, Any] = {}
@@ -130,6 +149,7 @@ class NFSService(CephService):
                 config['extra_args'] = ['-N', 'NIV_EVENT']
             config['files'] = {
                 'ganesha.conf': get_ganesha_conf(),
+                'idmap.conf': get_idmap_conf()
             }
             config.update(
                 self.get_config_and_keyring(
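The new get_idmap_conf() helper serializes spec.idmap_conf (a nested mapping) into INI text for ganesha. A standalone, runnable equivalent with an illustrative mapping (note that ConfigParser lowercases option names by default):

from configparser import ConfigParser
from io import StringIO

# spec.idmap_conf would arrive as a nested mapping like this one
# (sections -> options); the values here are illustrative.
idmap_conf = {
    'General': {'Domain': 'example.com'},
    'Mapping': {'Nobody-User': 'nobody', 'Nobody-Group': 'nogroup'},
}

cp = ConfigParser()
cp.read_dict(idmap_conf)
out = StringIO()
cp.write(out)
print(out.getvalue())
# [General]
# domain = example.com
# ... this text is what lands in /etc/ganesha/idmap.conf in the container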
diff --git a/src/pybind/mgr/cephadm/services/node_proxy.py b/src/pybind/mgr/cephadm/services/node_proxy.py
new file mode 100644
index 000000000..e5608ca42
--- /dev/null
+++ b/src/pybind/mgr/cephadm/services/node_proxy.py
@@ -0,0 +1,180 @@
+import json
+import ssl
+import base64
+
+from urllib.error import HTTPError, URLError
+from typing import List, Any, Dict, Tuple, Optional, MutableMapping
+
+from .cephadmservice import CephadmDaemonDeploySpec, CephService
+from ceph.deployment.service_spec import ServiceSpec, PlacementSpec
+from ceph.utils import http_req
+from orchestrator import OrchestratorError
+
+
+class NodeProxy(CephService):
+    TYPE = 'node-proxy'
+
+    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
+        assert self.TYPE == daemon_spec.daemon_type
+        daemon_id, host = daemon_spec.daemon_id, daemon_spec.host
+
+        if not self.mgr.http_server.agent:
+            raise OrchestratorError('Cannot deploy node-proxy before creating cephadm endpoint')
+
+        keyring = self.get_keyring_with_caps(self.get_auth_entity(daemon_id, host=host), [])
+        daemon_spec.keyring = keyring
+        self.mgr.node_proxy_cache.update_keyring(host, keyring)
+
+        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
+
+        return daemon_spec
+
+    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
+        # node-proxy is re-using the agent endpoint and therefore
+        # needs similar checks to see if the endpoint is ready.
+        self.agent_endpoint = self.mgr.http_server.agent
+        try:
+            assert self.agent_endpoint
+            assert self.agent_endpoint.ssl_certs.get_root_cert()
+            assert self.agent_endpoint.server_port
+        except Exception:
+            raise OrchestratorError(
+                'Cannot deploy node-proxy daemons until cephadm endpoint has finished generating certs')
+
+        listener_cert, listener_key = self.agent_endpoint.ssl_certs.generate_cert(daemon_spec.host, self.mgr.inventory.get_addr(daemon_spec.host))
+        cfg = {
+            'target_ip': self.mgr.get_mgr_ip(),
+            'target_port': self.agent_endpoint.server_port,
+            'name': f'node-proxy.{daemon_spec.host}',
+            'keyring': daemon_spec.keyring,
+            'root_cert.pem': self.agent_endpoint.ssl_certs.get_root_cert(),
+            'listener.crt': listener_cert,
+            'listener.key': listener_key,
+        }
+        config = {'node-proxy.json': json.dumps(cfg)}
+
+        return config, sorted([str(self.mgr.get_mgr_ip()), str(self.agent_endpoint.server_port),
+                               self.agent_endpoint.ssl_certs.get_root_cert()])
+
+    def handle_hw_monitoring_setting(self) -> bool:
+        # function to apply or remove node-proxy service spec depending
+        # on whether the hw_monitoring config option is set or not.
+        # It should return True when it either creates or deletes a spec
+        # and False otherwise.
+        if self.mgr.hw_monitoring:
+            if 'node-proxy' not in self.mgr.spec_store:
+                spec = ServiceSpec(
+                    service_type='node-proxy',
+                    placement=PlacementSpec(host_pattern='*')
+                )
+                self.mgr.spec_store.save(spec)
+                return True
+            return False
+        else:
+            if 'node-proxy' in self.mgr.spec_store:
+                self.mgr.spec_store.rm('node-proxy')
+                return True
+            return False
+
+    def get_ssl_ctx(self) -> ssl.SSLContext:
+        ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert()
+        ssl_ctx = ssl.create_default_context()
+        ssl_ctx.check_hostname = True
+        ssl_ctx.verify_mode = ssl.CERT_REQUIRED
+        ssl_ctx.load_verify_locations(cadata=ssl_root_crt)
+        return ssl_ctx
+
+    def led(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]:
+        ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+        header: MutableMapping[str, str] = {}
+        method: str = 'PATCH' if action in ['on', 'off'] else 'GET'
+        payload: Optional[Dict[str, str]] = None
+        addr: str = self.mgr.inventory.get_addr(hostname)
+        endpoint: List[str] = ['led', light_type]
+        _device: str = device if device else ''
+
+        if light_type == 'drive':
+            endpoint.append(_device)
+
+        if method == 'PATCH':
+            payload = dict(state=action)
+
+        header = self.generate_auth_header(hostname)
+
+        endpoint = f'/{"/".join(endpoint)}'
+
+        try:
+            headers, result, status = http_req(hostname=addr,
+                                               port='9456',
+                                               headers=header,
+                                               method=method,
+                                               data=json.dumps(payload),
+                                               endpoint=endpoint,
+                                               ssl_ctx=ssl_ctx)
+            result_json = json.loads(result)
+        except HTTPError as e:
+            self.mgr.log.error(e)
+            raise
+        except URLError as e:
+            raise RuntimeError(e)
+
+        return result_json
+
+    def generate_auth_header(self, hostname: str) -> Dict[str, str]:
+        try:
+            username = self.mgr.node_proxy_cache.oob[hostname]['username']
+            password = self.mgr.node_proxy_cache.oob[hostname]['password']
+            auth: bytes = f'{username}:{password}'.encode('utf-8')
+            auth_str: str = base64.b64encode(auth).decode('utf-8')
+            header = {'Authorization': f'Basic {auth_str}'}
+        except KeyError as e:
+            self.mgr.log.error(f'Check oob information is provided for {hostname}.')
+            raise RuntimeError(e)
+        return header
+
+    def shutdown(self, hostname: str, force: Optional[bool] = False) -> Dict[str, Any]:
+        ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+        header: Dict[str, str] = self.generate_auth_header(hostname)
+        addr: str = self.mgr.inventory.get_addr(hostname)
+
+        endpoint = '/shutdown'
+        payload: Dict[str, Optional[bool]] = dict(force=force)
+
+        try:
+            headers, result, status = http_req(hostname=addr,
+                                               port='9456',
+                                               headers=header,
+                                               data=json.dumps(payload),
+                                               endpoint=endpoint,
+                                               ssl_ctx=ssl_ctx)
+            result_json = json.loads(result)
+        except HTTPError as e:
+            self.mgr.log.error(e)
+            raise
+        except URLError as e:
+            raise RuntimeError(e)
+
+        return result_json
+
+    def powercycle(self, hostname: str) -> Dict[str, Any]:
+        ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+        header: Dict[str, str] = self.generate_auth_header(hostname)
+        addr: str = self.mgr.inventory.get_addr(hostname)
+
+        endpoint = '/powercycle'
+
+        try:
+            headers, result, status = http_req(hostname=addr,
+                                               port='9456',
+                                               headers=header,
+                                               data="{}",
+                                               endpoint=endpoint,
+                                               ssl_ctx=ssl_ctx)
+            result_json = json.loads(result)
+        except HTTPError as e:
+            self.mgr.log.error(e)
+            raise
+        except URLError as e:
+            raise RuntimeError(e)
+
+        return result_json
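node-proxy authenticates against each host's out-of-band controller with HTTP Basic auth built from the cached credentials. A standalone equivalent of generate_auth_header(), with placeholder credentials:

import base64

# Placeholder credentials; cephadm reads them from node_proxy_cache.oob[host].
username, password = 'root', 'calvin'
auth = f'{username}:{password}'.encode('utf-8')
auth_str = base64.b64encode(auth).decode('utf-8')
header = {'Authorization': f'Basic {auth_str}'}
print(header)  # {'Authorization': 'Basic cm9vdDpjYWx2aW4='}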
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index 7d2dd16cf..5f28273d4 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -15,6 +15,7 @@ logger = logging.getLogger(__name__)
 
 class NvmeofService(CephService):
     TYPE = 'nvmeof'
+    PROMETHEUS_PORT = 10008
 
     def config(self, spec: NvmeofServiceSpec) -> None:  # type: ignore
         assert self.TYPE == spec.service_type
diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py
index bfecc5723..75b3fc58c 100644
--- a/src/pybind/mgr/cephadm/services/osd.py
+++ b/src/pybind/mgr/cephadm/services/osd.py
@@ -319,11 +319,16 @@ class OSDService(CephService):
                 logger.exception('Cannot decode JSON: \'%s\'' % ' '.join(out))
                 concat_out = {}
             notes = []
-            if osdspec.data_devices is not None and osdspec.data_devices.limit and len(concat_out) < osdspec.data_devices.limit:
+            if (
+                osdspec.data_devices is not None
+                and osdspec.data_devices.limit
+                and (len(concat_out) + ds.existing_daemons) < osdspec.data_devices.limit
+            ):
                 found = len(concat_out)
                 limit = osdspec.data_devices.limit
                 notes.append(
-                    f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit (Found: {found} | Limit: {limit})')
+                    f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit\n'
+                    f'(New Devices: {found} | Existing Matching Daemons: {ds.existing_daemons} | Limit: {limit})')
             ret_all.append({'data': concat_out,
                             'osdspec': osdspec.service_id,
                             'host': host,
@@ -664,6 +669,7 @@ class OSD:
             return None
         self.started = True
         self.stopped = False
+        self.original_weight = self.rm_util.get_weight(self)
 
     def start_draining(self) -> bool:
         if self.stopped:
@@ -672,7 +678,6 @@ class OSD:
         if self.replace:
             self.rm_util.set_osd_flag([self], 'out')
         else:
-            self.original_weight = self.rm_util.get_weight(self)
             self.rm_util.reweight_osd(self, 0.0)
         self.drain_started_at = datetime.utcnow()
         self.draining = True
@@ -761,6 +766,7 @@ class OSD:
         out['force'] = self.force
         out['zap'] = self.zap
         out['hostname'] = self.hostname  # type: ignore
+        out['original_weight'] = self.original_weight
 
         for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']:
             if getattr(self, k):
@@ -953,6 +959,16 @@ class OSDRemovalQueue(object):
             self.osds.add(osd)
             osd.start()
 
+    def rm_by_osd_id(self, osd_id: int) -> None:
+        osd: Optional["OSD"] = None
+        for o in self.osds:
+            if o.osd_id == osd_id:
+                osd = o
+        if not osd:
+            logger.debug(f"Could not find osd with id {osd_id} in queue.")
+            raise KeyError(f'No osd with id {osd_id} in removal queue')
+        self.rm(osd)
+
     def rm(self, osd: "OSD") -> None:
         if not osd.exists:
             raise NotFoundError()
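The new rm_by_osd_id() lets callers cancel a queued removal by numeric id rather than by OSD object. A self-contained sketch of the same lookup logic (the FakeOSD stand-in is illustrative):

from typing import Set

class FakeOSD:
    # Minimal stand-in for the real OSD object; only osd_id matters here.
    def __init__(self, osd_id: int) -> None:
        self.osd_id = osd_id

def rm_by_osd_id(queue: Set[FakeOSD], osd_id: int) -> FakeOSD:
    # Same linear scan as OSDRemovalQueue.rm_by_osd_id(): KeyError when
    # the id is not queued, so callers can report "not scheduled".
    for o in queue:
        if o.osd_id == osd_id:
            return o
    raise KeyError(f'No osd with id {osd_id} in removal queue')

queue = {FakeOSD(1), FakeOSD(3)}
assert rm_by_osd_id(queue, 3).osd_id == 3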