diff options
Diffstat (limited to 'src/pybind/mgr/cephadm')
31 files changed, 1749 insertions, 69 deletions
diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index 93a08cb34..9e71477d4 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -16,14 +16,15 @@ import time from orchestrator import DaemonDescriptionStatus from orchestrator._interface import daemon_type_to_service -from ceph.utils import datetime_now +from ceph.utils import datetime_now, http_req from ceph.deployment.inventory import Devices from ceph.deployment.service_spec import ServiceSpec, PlacementSpec from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from cephadm.ssl_cert_utils import SSLCerts from mgr_util import test_port_allocation, PortAlreadyInUse -from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional +from urllib.error import HTTPError, URLError +from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional, MutableMapping if TYPE_CHECKING: from cephadm.module import CephadmOrchestrator @@ -53,11 +54,10 @@ class AgentEndpoint: self.server_addr = self.mgr.get_mgr_ip() def configure_routes(self) -> None: - d = cherrypy.dispatch.RoutesDispatcher() - d.connect(name='host-data', route='/data/', - controller=self.host_data.POST, - conditions=dict(method=['POST'])) - cherrypy.tree.mount(None, '/', config={'/': {'request.dispatch': d}}) + conf = {'/': {'tools.trailing_slash.on': False}} + + cherrypy.tree.mount(self.host_data, '/data', config=conf) + cherrypy.tree.mount(self.node_proxy_endpoint, '/node-proxy', config=conf) def configure_tls(self, server: Server) -> None: old_cert = self.mgr.get_store(self.KV_STORE_AGENT_ROOT_CERT) @@ -88,10 +88,540 @@ class AgentEndpoint: def configure(self) -> None: self.host_data = HostData(self.mgr, self.server_port, self.server_addr) self.configure_tls(self.host_data) + self.node_proxy_endpoint = NodeProxyEndpoint(self.mgr) self.configure_routes() self.find_free_port() +class NodeProxyEndpoint: + def __init__(self, mgr: "CephadmOrchestrator"): + self.mgr = mgr + self.ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert() + self.ssl_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + self.ssl_ctx.check_hostname = False + self.ssl_ctx.verify_mode = ssl.CERT_NONE + # self.ssl_ctx = ssl.create_default_context() + # self.ssl_ctx.check_hostname = True + # self.ssl_ctx.verify_mode = ssl.CERT_REQUIRED + # self.ssl_ctx.load_verify_locations(cadata=self.ssl_root_crt) + self.redfish_token: str = '' + self.redfish_session_location: str = '' + + def _cp_dispatch(self, vpath: List[str]) -> "NodeProxyEndpoint": + if len(vpath) > 1: # /{hostname}/<endpoint> + hostname = vpath.pop(0) # /<endpoint> + cherrypy.request.params['hostname'] = hostname + # /{hostname}/led/{type}/{drive} eg: /{hostname}/led/chassis or /{hostname}/led/drive/{id} + if vpath[0] == 'led' and len(vpath) > 1: # /led/{type}/{id} + _type = vpath[1] + cherrypy.request.params['type'] = _type + vpath.pop(1) # /led/{id} or # /led + if _type == 'drive' and len(vpath) > 1: # /led/{id} + _id = vpath[1] + vpath.pop(1) # /led + cherrypy.request.params['id'] = _id + # /<endpoint> + return self + + @cherrypy.expose + @cherrypy.tools.allow(methods=['POST']) + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + def oob(self) -> Dict[str, Any]: + """ + Get the out-of-band management tool details for a given host. + + :return: oob details. + :rtype: dict + """ + data: Dict[str, Any] = cherrypy.request.json + results: Dict[str, Any] = {} + + self.validate_node_proxy_data(data) + + # expecting name to be "node-proxy.<hostname>" + hostname = data['cephx']['name'][11:] + results['result'] = self.mgr.node_proxy_cache.oob.get(hostname, '') + if not results['result']: + raise cherrypy.HTTPError(400, 'The provided host has no iDrac details.') + return results + + def validate_node_proxy_data(self, data: Dict[str, Any]) -> None: + """ + Validate received data. + + :param data: data to validate. + :type data: dict + + :raises cherrypy.HTTPError 400: If the data is not valid (missing fields) + :raises cherrypy.HTTPError 403: If the secret provided is wrong. + """ + cherrypy.response.status = 200 + try: + if 'cephx' not in data.keys(): + raise cherrypy.HTTPError(400, 'The field \'cephx\' must be provided.') + elif 'name' not in data['cephx'].keys(): + cherrypy.response.status = 400 + raise cherrypy.HTTPError(400, 'The field \'name\' must be provided.') + # expecting name to be "node-proxy.<hostname>" + hostname = data['cephx']['name'][11:] + if 'secret' not in data['cephx'].keys(): + raise cherrypy.HTTPError(400, 'The node-proxy keyring must be provided.') + elif not self.mgr.node_proxy_cache.keyrings.get(hostname, ''): + raise cherrypy.HTTPError(502, f'Make sure the node-proxy is running on {hostname}') + elif data['cephx']['secret'] != self.mgr.node_proxy_cache.keyrings[hostname]: + raise cherrypy.HTTPError(403, f'Got wrong keyring from agent on host {hostname}.') + except AttributeError: + raise cherrypy.HTTPError(400, 'Malformed data received.') + + # TODO(guits): refactor this + # TODO(guits): use self.node_proxy.get_critical_from_host() ? + def get_nok_members(self, + data: Dict[str, Any]) -> List[Dict[str, str]]: + """ + Retrieves members whose status is not 'ok'. + + :param data: Data containing information about members. + :type data: dict + + :return: A list containing dictionaries of members whose status is not 'ok'. + :rtype: List[Dict[str, str]] + + :return: None + :rtype: None + """ + nok_members: List[Dict[str, str]] = [] + + for member in data.keys(): + _status = data[member]['status']['health'].lower() + if _status.lower() != 'ok': + state = data[member]['status']['state'] + _member = dict( + member=member, + status=_status, + state=state + ) + nok_members.append(_member) + + return nok_members + + def raise_alert(self, data: Dict[str, Any]) -> None: + """ + Raises hardware alerts based on the provided patch status. + + :param data: Data containing patch status information. + :type data: dict + + This function iterates through the provided status + information to raise hardware alerts. + For each component in the provided data, it removes any + existing health warnings associated with it and checks + for non-okay members using the `get_nok_members` method. + If non-okay members are found, it sets a new health + warning for that component and generates a report detailing + the non-okay members' statuses. + + Note: This function relies on the `get_nok_members` method to + identify non-okay members. + + :return: None + :rtype: None + """ + + for component in data['patch']['status'].keys(): + alert_name = f"HARDWARE_{component.upper()}" + self.mgr.remove_health_warning(alert_name) + nok_members = self.get_nok_members(data['patch']['status'][component]) + + if nok_members: + count = len(nok_members) + self.mgr.set_health_warning( + alert_name, + summary=f'{count} {component} member{"s" if count > 1 else ""} {"are" if count > 1 else "is"} not ok', + count=count, + detail=[f"{member['member']} is {member['status']}: {member['state']}" for member in nok_members], + ) + + @cherrypy.expose + @cherrypy.tools.allow(methods=['POST']) + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + def data(self) -> None: + """ + Handles incoming data via a POST request. + + This function is exposed to handle POST requests and expects incoming + JSON data. It processes the incoming data by first validating it + through the `validate_node_proxy_data` method. Subsequently, it + extracts the hostname from the data and saves the information + using `mgr.node_proxy.save`. Finally, it raises alerts based on the + provided status through the `raise_alert` method. + + :return: None + :rtype: None + """ + data: Dict[str, Any] = cherrypy.request.json + self.validate_node_proxy_data(data) + if 'patch' not in data.keys(): + raise cherrypy.HTTPError(400, 'Malformed data received.') + host = data['cephx']['name'][11:] + self.mgr.node_proxy_cache.save(host, data['patch']) + self.raise_alert(data) + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET', 'PATCH']) + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + def led(self, **kw: Any) -> Dict[str, Any]: + """ + Handles enclosure LED operations for the specified hostname. + + This function handles GET and PATCH requests related to LED status for a + specific hostname. It identifies the request method and provided hostname. + If the hostname is missing, it logs an error and returns an error message. + + For PATCH requests, it prepares authorization headers based on the + provided ID and password, encodes them, and constructs the authorization + header. + + After processing, it queries the endpoint and returns the result. + + :param kw: Keyword arguments including 'hostname'. + :type kw: dict + + :return: Result of the LED-related operation. + :rtype: dict[str, Any] + """ + method: str = cherrypy.request.method + header: MutableMapping[str, str] = {} + hostname: Optional[str] = kw.get('hostname') + led_type: Optional[str] = kw.get('type') + id_drive: Optional[str] = kw.get('id') + payload: Optional[Dict[str, str]] = None + endpoint: List[Any] = ['led', led_type] + device: str = id_drive if id_drive else '' + + ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert() + ssl_ctx = ssl.create_default_context() + ssl_ctx.check_hostname = True + ssl_ctx.verify_mode = ssl.CERT_REQUIRED + ssl_ctx.load_verify_locations(cadata=ssl_root_crt) + + if not hostname: + msg: str = "listing enclosure LED status for all nodes is not implemented." + self.mgr.log.debug(msg) + raise cherrypy.HTTPError(501, msg) + + if not led_type: + msg = "the led type must be provided (either 'chassis' or 'drive')." + self.mgr.log.debug(msg) + raise cherrypy.HTTPError(400, msg) + + if led_type == 'drive' and not id_drive: + msg = "the id of the drive must be provided when type is 'drive'." + self.mgr.log.debug(msg) + raise cherrypy.HTTPError(400, msg) + + if led_type == 'drive': + endpoint.append(device) + + if hostname not in self.mgr.node_proxy_cache.data.keys(): + # TODO(guits): update unit test for this + msg = f"'{hostname}' not found." + self.mgr.log.debug(msg) + raise cherrypy.HTTPError(400, msg) + + addr: str = self.mgr.inventory.get_addr(hostname) + + if method == 'PATCH': + # TODO(guits): need to check the request is authorized + # allowing a specific keyring only ? (client.admin or client.agent.. ?) + data: Dict[str, Any] = cherrypy.request.json + if 'state' not in data.keys(): + msg = "'state' key not provided." + raise cherrypy.HTTPError(400, msg) + if 'keyring' not in data.keys(): + msg = "'keyring' key must be provided." + raise cherrypy.HTTPError(400, msg) + if data['keyring'] != self.mgr.node_proxy_cache.keyrings.get(hostname): + msg = 'wrong keyring provided.' + raise cherrypy.HTTPError(401, msg) + payload = {} + payload['state'] = data['state'] + + if led_type == 'drive': + if id_drive not in self.mgr.node_proxy_cache.data[hostname]['status']['storage'].keys(): + # TODO(guits): update unit test for this + msg = f"'{id_drive}' not found." + self.mgr.log.debug(msg) + raise cherrypy.HTTPError(400, msg) + + endpoint = f'/{"/".join(endpoint)}' + header = self.mgr.node_proxy.generate_auth_header(hostname) + + try: + headers, result, status = http_req(hostname=addr, + port='9456', + headers=header, + method=method, + data=json.dumps(payload), + endpoint=endpoint, + ssl_ctx=ssl_ctx) + response_json = json.loads(result) + except HTTPError as e: + self.mgr.log.debug(e) + except URLError: + raise cherrypy.HTTPError(502, f'Make sure the node-proxy agent is deployed and running on {hostname}') + + return response_json + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def fullreport(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve a full report. + + This function is exposed to handle GET requests and retrieves a comprehensive + report using the 'fullreport' method from the NodeProxyCache class. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: The full report data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.fullreport(**kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def criticals(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve critical information. + + This function is exposed to handle GET requests and fetches critical data + using the 'criticals' method from the NodeProxyCache class. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Critical information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.criticals(**kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def summary(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve summary information. + + This function is exposed to handle GET requests and fetches summary + data using the 'summary' method from the NodeProxyCache class. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Summary information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.summary(**kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def memory(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve specific information. + + This function is exposed to handle GET requests + and fetch specific data using the 'common' method + from the NodeProxyCache class with. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Specific information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.common('memory', **kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def network(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve specific information. + + This function is exposed to handle GET requests + and fetch specific data using the 'common' method + from the NodeProxyCache class with. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Specific information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.common('network', **kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def processors(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve specific information. + + This function is exposed to handle GET requests + and fetch specific data using the 'common' method + from the NodeProxyCache class with. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Specific information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.common('processors', **kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def storage(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve specific information. + + This function is exposed to handle GET requests + and fetch specific data using the 'common' method + from the NodeProxyCache class with. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Specific information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.common('storage', **kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def power(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve specific information. + + This function is exposed to handle GET requests + and fetch specific data using the 'common' method + from the NodeProxyCache class with. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Specific information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.common('power', **kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def fans(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve specific information. + + This function is exposed to handle GET requests + and fetch specific data using the 'common' method + from the NodeProxyCache class with. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Specific information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.common('fans', **kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + @cherrypy.expose + @cherrypy.tools.allow(methods=['GET']) + @cherrypy.tools.json_out() + def firmwares(self, **kw: Any) -> Dict[str, Any]: + """ + Handles GET request to retrieve firmware information. + + This function is exposed to handle GET requests and fetches firmware data using + the 'firmwares' method from the NodeProxyCache class. + + :param kw: Keyword arguments for the request. + :type kw: dict + + :return: Firmware information data. + :rtype: dict[str, Any] + + :raises cherrypy.HTTPError 404: If the passed hostname is not found. + """ + try: + results = self.mgr.node_proxy_cache.firmwares(**kw) + except KeyError: + raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.") + return results + + class HostData(Server): exposed = True @@ -109,9 +639,11 @@ class HostData(Server): self.unsubscribe() super().stop() + @cherrypy.tools.allow(methods=['POST']) @cherrypy.tools.json_in() @cherrypy.tools.json_out() - def POST(self) -> Dict[str, Any]: + @cherrypy.expose + def index(self) -> Dict[str, Any]: data: Dict[str, Any] = cherrypy.request.json results: Dict[str, Any] = {} try: @@ -234,6 +766,7 @@ class AgentMessageThread(threading.Thread): self.port = port self.data: str = json.dumps(data) self.daemon_spec: Optional[CephadmDaemonDeploySpec] = daemon_spec + self.agent_response: str = '' super().__init__(target=self.run) def run(self) -> None: @@ -286,8 +819,8 @@ class AgentMessageThread(threading.Thread): secure_agent_socket.connect((self.addr, self.port)) msg = (bytes_len + self.data) secure_agent_socket.sendall(msg.encode('utf-8')) - agent_response = secure_agent_socket.recv(1024).decode() - self.mgr.log.debug(f'Received "{agent_response}" from agent on host {self.host}') + self.agent_response = secure_agent_socket.recv(1024).decode() + self.mgr.log.debug(f'Received "{self.agent_response}" from agent on host {self.host}') if self.daemon_spec: self.mgr.agent_cache.agent_config_successfully_delivered(self.daemon_spec) self.mgr.agent_cache.sending_agent_message[self.host] = False @@ -307,6 +840,9 @@ class AgentMessageThread(threading.Thread): self.mgr.agent_cache.sending_agent_message[self.host] = False return + def get_agent_response(self) -> str: + return self.agent_response + class CephadmAgentHelpers: def __init__(self, mgr: "CephadmOrchestrator"): @@ -403,10 +939,11 @@ class CephadmAgentHelpers: if 'agent' in self.mgr.spec_store: self.mgr.spec_store.rm('agent') need_apply = True - self.mgr.agent_cache.agent_counter = {} - self.mgr.agent_cache.agent_timestamp = {} - self.mgr.agent_cache.agent_keys = {} - self.mgr.agent_cache.agent_ports = {} + if not self.mgr.cache.get_daemons_by_service('agent'): + self.mgr.agent_cache.agent_counter = {} + self.mgr.agent_cache.agent_timestamp = {} + self.mgr.agent_cache.agent_keys = {} + self.mgr.agent_cache.agent_ports = {} return need_apply def _check_agent(self, host: str) -> bool: diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py index 51c931cba..72ebcd660 100644 --- a/src/pybind/mgr/cephadm/autotune.py +++ b/src/pybind/mgr/cephadm/autotune.py @@ -15,6 +15,7 @@ class MemoryAutotuner(object): 'crash': 128 * 1048576, 'keepalived': 128 * 1048576, 'haproxy': 128 * 1048576, + 'nvmeof': 4096 * 1048576, } default_size = 1024 * 1048576 diff --git a/src/pybind/mgr/cephadm/configchecks.py b/src/pybind/mgr/cephadm/configchecks.py index b9dcb18f4..9077a1d66 100644 --- a/src/pybind/mgr/cephadm/configchecks.py +++ b/src/pybind/mgr/cephadm/configchecks.py @@ -674,7 +674,7 @@ class CephadmConfigChecks: self.host_to_role[hostname] = list(self.mgr.cache.get_daemon_types(hostname)) def run_checks(self) -> None: - checks_enabled = self.mgr.get_module_option('config_checks_enabled') + checks_enabled = self.mgr.config_checks_enabled if checks_enabled is not True: return diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index 7153ca6dc..966ffc046 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -8,7 +8,7 @@ import logging import math import socket from typing import TYPE_CHECKING, Dict, List, Iterator, Optional, Any, Tuple, Set, Mapping, cast, \ - NamedTuple, Type + NamedTuple, Type, ValuesView import orchestrator from ceph.deployment import inventory @@ -29,6 +29,7 @@ logger = logging.getLogger(__name__) HOST_CACHE_PREFIX = "host." SPEC_STORE_PREFIX = "spec." AGENT_CACHE_PREFIX = 'agent.' +NODE_PROXY_CACHE_PREFIX = 'node_proxy' class HostCacheStatus(enum.Enum): @@ -1405,6 +1406,196 @@ class HostCache(): return self.scheduled_daemon_actions.get(host, {}).get(daemon) +class NodeProxyCache: + def __init__(self, mgr: "CephadmOrchestrator") -> None: + self.mgr = mgr + self.data: Dict[str, Any] = {} + self.oob: Dict[str, Any] = {} + self.keyrings: Dict[str, str] = {} + + def load(self) -> None: + _oob = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/oob', '{}') + self.oob = json.loads(_oob) + + _keyrings = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/keyrings', '{}') + self.keyrings = json.loads(_keyrings) + + for k, v in self.mgr.get_store_prefix(f'{NODE_PROXY_CACHE_PREFIX}/data').items(): + host = k.split('/')[-1:][0] + + if host not in self.mgr.inventory.keys(): + # remove entry for host that no longer exists + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", None) + try: + self.oob.pop(host) + self.data.pop(host) + self.keyrings.pop(host) + except KeyError: + pass + continue + + self.data[host] = json.loads(v) + + def save(self, + host: str = '', + data: Dict[str, Any] = {}) -> None: + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", json.dumps(data)) + + def update_oob(self, host: str, host_oob_info: Dict[str, str]) -> None: + self.oob[host] = host_oob_info + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/oob", json.dumps(self.oob)) + + def update_keyring(self, host: str, key: str) -> None: + self.keyrings[host] = key + self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/keyrings", json.dumps(self.keyrings)) + + def fullreport(self, **kw: Any) -> Dict[str, Any]: + """ + Retrieves the full report for the specified hostname. + + If a hostname is provided in the keyword arguments, it retrieves the full report + data for that specific host. If no hostname is provided, it fetches the full + report data for all hosts available. + + :param kw: Keyword arguments including 'hostname'. + :type kw: dict + + :return: The full report data for the specified hostname(s). + :rtype: dict + """ + hostname = kw.get('hostname') + hosts = [hostname] if hostname else self.data.keys() + return {host: self.data[host] for host in hosts} + + def summary(self, **kw: Any) -> Dict[str, Any]: + """ + Summarizes the health status of components for specified hosts or all hosts. + + Generates a summary of the health status of components for given hosts. If + no hostname is provided, it generates the health status summary for all hosts. + It inspects the status of each component and categorizes it as 'ok' or 'error' + based on the health status of its members. + + :param kw: Keyword arguments including 'hostname'. + :type kw: dict + + :return: A dictionary containing the health status summary for each specified + host or all hosts and their components. + :rtype: Dict[str, Dict[str, str]] + """ + hostname = kw.get('hostname') + hosts = [hostname] if hostname else self.data.keys() + + def is_unknown(statuses: ValuesView) -> bool: + return any([status['status']['health'].lower() == 'unknown' for status in statuses]) and not is_error(statuses) + + def is_error(statuses: ValuesView) -> bool: + return any([status['status']['health'].lower() == 'error' for status in statuses]) + + _result: Dict[str, Any] = {} + + for host in hosts: + _result[host] = {} + _result[host]['status'] = {} + data = self.data[host] + for component in data['status'].keys(): + values = data['status'][component].values() + if is_error(values): + state = 'error' + elif is_unknown(values): + state = 'unknown' + else: + state = 'ok' + _result[host]['status'][component] = state + _result[host]['sn'] = data['sn'] + _result[host]['host'] = data['host'] + _result[host]['firmwares'] = data['firmwares'] + return _result + + def common(self, endpoint: str, **kw: Any) -> Dict[str, Any]: + """ + Retrieves specific endpoint information for a specific hostname or all hosts. + + Retrieves information from the specified 'endpoint' for all available hosts. + If 'hostname' is provided, retrieves the specified 'endpoint' information for that host. + + :param endpoint: The endpoint for which information is retrieved. + :type endpoint: str + :param kw: Keyword arguments, including 'hostname' if specified. + :type kw: dict + + :return: Endpoint information for the specified host(s). + :rtype: Union[Dict[str, Any], Any] + """ + hostname = kw.get('hostname') + _result = {} + hosts = [hostname] if hostname else self.data.keys() + + for host in hosts: + try: + _result[host] = self.data[host]['status'][endpoint] + except KeyError: + raise KeyError(f'Invalid host {host} or component {endpoint}.') + return _result + + def firmwares(self, **kw: Any) -> Dict[str, Any]: + """ + Retrieves firmware information for a specific hostname or all hosts. + + If a 'hostname' is provided in the keyword arguments, retrieves firmware + information for that specific host. Otherwise, retrieves firmware + information for all available hosts. + + :param kw: Keyword arguments, including 'hostname' if specified. + :type kw: dict + + :return: A dictionary containing firmware information for each host. + :rtype: Dict[str, Any] + """ + hostname = kw.get('hostname') + hosts = [hostname] if hostname else self.data.keys() + + return {host: self.data[host]['firmwares'] for host in hosts} + + def get_critical_from_host(self, hostname: str) -> Dict[str, Any]: + results: Dict[str, Any] = {} + for component, data_component in self.data[hostname]['status'].items(): + if component not in results.keys(): + results[component] = {} + for member, data_member in data_component.items(): + if component == 'power': + data_member['status']['health'] = 'critical' + data_member['status']['state'] = 'unplugged' + if component == 'memory': + data_member['status']['health'] = 'critical' + data_member['status']['state'] = 'errors detected' + if data_member['status']['health'].lower() != 'ok': + results[component][member] = data_member + return results + + def criticals(self, **kw: Any) -> Dict[str, Any]: + """ + Retrieves critical information for a specific hostname or all hosts. + + If a 'hostname' is provided in the keyword arguments, retrieves critical + information for that specific host. Otherwise, retrieves critical + information for all available hosts. + + :param kw: Keyword arguments, including 'hostname' if specified. + :type kw: dict + + :return: A dictionary containing critical information for each host. + :rtype: List[Dict[str, Any]] + """ + hostname = kw.get('hostname') + results: Dict[str, Any] = {} + + hosts = [hostname] if hostname else self.data.keys() + for host in hosts: + results[host] = self.get_critical_from_host(host) + return results + + class AgentCache(): """ AgentCache is used for storing metadata about agent daemons that must be kept @@ -1507,6 +1698,8 @@ class EventStore(): for e in self.events[event.kind_subject()]: if e.message == event.message: + # if subject and message match, just update the timestamp + e.created = event.created return self.events[event.kind_subject()].append(event) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 7b97ce74a..f1d234fb2 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1,4 +1,5 @@ import asyncio +import concurrent import json import errno import ipaddress @@ -10,6 +11,7 @@ from configparser import ConfigParser from contextlib import contextmanager from functools import wraps from tempfile import TemporaryDirectory, NamedTemporaryFile +from urllib.error import HTTPError from threading import Event from cephadm.service_discovery import ServiceDiscovery @@ -39,7 +41,14 @@ from cephadm.http_server import CephadmHttpServer from cephadm.agent import CephadmAgentHelpers -from mgr_module import MgrModule, HandleCommandResult, Option, NotifyType +from mgr_module import ( + MgrModule, + HandleCommandResult, + Option, + NotifyType, + MonCommandFailed, +) +from mgr_util import build_url import orchestrator from orchestrator.module import to_format, Format @@ -64,9 +73,10 @@ from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \ NodeExporterService, SNMPGatewayService, LokiService, PromtailService from .services.jaeger import ElasticSearchService, JaegerAgentService, JaegerCollectorService, JaegerQueryService +from .services.node_proxy import NodeProxy from .schedule import HostAssignment from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \ - ClientKeyringStore, ClientKeyringSpec, TunedProfileStore + ClientKeyringStore, ClientKeyringSpec, TunedProfileStore, NodeProxyCache from .upgrade import CephadmUpgrade from .template import TemplateMgr from .utils import CEPH_IMAGE_TYPES, RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES, forall_hosts, \ @@ -107,7 +117,7 @@ os._exit = os_exit_noop # type: ignore DEFAULT_IMAGE = 'quay.io/ceph/ceph' # DO NOT ADD TAG TO THIS DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.43.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.5.0' -DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:0.0.2' +DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.0.0' DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0' DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.25.0' @@ -436,6 +446,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down' ), Option( + 'hw_monitoring', + type='bool', + default=False, + desc='Deploy hw monitoring daemon on every host.' + ), + Option( 'max_osd_draining_count', type='int', default=10, @@ -467,11 +483,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, ), Option( 'default_cephadm_command_timeout', - type='secs', + type='int', default=15 * 60, desc='Default timeout applied to cephadm commands run directly on ' 'the host (in seconds)' ), + Option( + 'oob_default_addr', + type='str', + default='169.254.1.1', + desc="Default address for RedFish API (oob management)." + ), ] def __init__(self, *args: Any, **kwargs: Any): @@ -529,6 +551,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.registry_password: Optional[str] = None self.registry_insecure: bool = False self.use_repo_digest = True + self.config_checks_enabled = False self.default_registry = '' self.autotune_memory_target_ratio = 0.0 self.autotune_interval = 0 @@ -545,6 +568,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.agent_refresh_rate = 0 self.agent_down_multiplier = 0.0 self.agent_starting_port = 0 + self.hw_monitoring = False self.service_discovery_port = 0 self.secure_monitoring_stack = False self.apply_spec_fails: List[Tuple[str, str]] = [] @@ -554,6 +578,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.cgroups_split = True self.log_refresh_metadata = False self.default_cephadm_command_timeout = 0 + self.oob_default_addr = '' self.notify(NotifyType.mon_map, None) self.config_notify() @@ -584,6 +609,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.cache = HostCache(self) self.cache.load() + self.node_proxy_cache = NodeProxyCache(self) + self.node_proxy_cache.load() + self.agent_cache = AgentCache(self) self.agent_cache.load() @@ -621,7 +649,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, PrometheusService, NodeExporterService, LokiService, PromtailService, CrashService, IscsiService, IngressService, CustomContainerService, CephfsMirrorService, NvmeofService, CephadmAgent, CephExporterService, SNMPGatewayService, ElasticSearchService, - JaegerQueryService, JaegerAgentService, JaegerCollectorService + JaegerQueryService, JaegerAgentService, JaegerCollectorService, NodeProxy ] # https://github.com/python/mypy/issues/8993 @@ -632,6 +660,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.osd_service: OSDService = cast(OSDService, self.cephadm_services['osd']) self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi']) self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof']) + self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy']) self.scheduled_async_actions: List[Callable] = [] @@ -644,6 +673,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.http_server = CephadmHttpServer(self) self.http_server.start() + + self.node_proxy = NodeProxy(self) + self.agent_helpers = CephadmAgentHelpers(self) if self.use_agent: self.agent_helpers._apply_agent() @@ -709,7 +741,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, # are provided, that will be included in the OrchestratorError's message try: yield - except asyncio.TimeoutError: + except (asyncio.TimeoutError, concurrent.futures.TimeoutError): err_str: str = '' if cmd: err_str = f'Command "{cmd}" timed out ' @@ -722,6 +754,16 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, else: err_str += (f'(default {self.default_cephadm_command_timeout} second timeout)') raise OrchestratorError(err_str) + except concurrent.futures.CancelledError as e: + err_str = '' + if cmd: + err_str = f'Command "{cmd}" failed ' + else: + err_str = 'Command failed ' + if host: + err_str += f'on host {host} ' + err_str += f' - {str(e)}' + raise OrchestratorError(err_str) def set_container_image(self, entity: str, image: str) -> None: self.check_mon_command({ @@ -810,7 +852,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, Generate a unique random service name """ suffix = daemon_type not in [ - 'mon', 'crash', 'ceph-exporter', + 'mon', 'crash', 'ceph-exporter', 'node-proxy', 'prometheus', 'node-exporter', 'grafana', 'alertmanager', 'container', 'agent', 'snmp-gateway', 'loki', 'promtail', 'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query' @@ -948,6 +990,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len( failed_daemons), failed_daemons) + def get_first_matching_network_ip(self, host: str, sspec: ServiceSpec) -> Optional[str]: + sspec_networks = sspec.networks + for subnet, ifaces in self.cache.networks.get(host, {}).items(): + host_network = ipaddress.ip_network(subnet) + for spec_network_str in sspec_networks: + spec_network = ipaddress.ip_network(spec_network_str) + if host_network.overlaps(spec_network): + return list(ifaces.values())[0][0] + logger.error(f'{spec_network} from {sspec.service_name()} spec does not overlap with {host_network} on {host}') + return None + @staticmethod def can_run() -> Tuple[bool, str]: if asyncssh is not None: @@ -1320,7 +1373,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, @orchestrator._cli_read_command('cephadm config-check status') def _config_check_status(self) -> HandleCommandResult: """Show whether the configuration checker feature is enabled/disabled""" - status = self.get_module_option('config_checks_enabled') + status = self.config_checks_enabled return HandleCommandResult(stdout="Enabled" if status else "Disabled") @orchestrator._cli_write_command('cephadm config-check enable') @@ -1597,6 +1650,18 @@ Then run the following: if spec.hostname in self.inventory and self.inventory.get_addr(spec.hostname) != spec.addr: self.cache.refresh_all_host_info(spec.hostname) + if spec.oob: + if not spec.oob.get('addr'): + spec.oob['addr'] = self.oob_default_addr + if not spec.oob.get('port'): + spec.oob['port'] = '443' + host_oob_info = dict() + host_oob_info['addr'] = spec.oob['addr'] + host_oob_info['port'] = spec.oob['port'] + host_oob_info['username'] = spec.oob['username'] + host_oob_info['password'] = spec.oob['password'] + self.node_proxy_cache.update_oob(spec.hostname, host_oob_info) + # prime crush map? if spec.location: self.check_mon_command({ @@ -1621,7 +1686,72 @@ Then run the following: return self._add_host(spec) @handle_orch_error - def remove_host(self, host: str, force: bool = False, offline: bool = False) -> str: + def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]: + try: + result = self.node_proxy.led(light_type=light_type, + action=action, + hostname=hostname, + device=device) + except RuntimeError as e: + self.log.error(e) + raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}') + except HTTPError as e: + self.log.error(e) + raise OrchestratorValidationError(f"http error while querying node-proxy API: {e}") + return result + + @handle_orch_error + def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> str: + if not yes_i_really_mean_it: + raise OrchestratorError("you must pass --yes-i-really-mean-it") + + try: + self.node_proxy.shutdown(hostname, force) + except RuntimeError as e: + self.log.error(e) + raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}') + except HTTPError as e: + self.log.error(e) + raise OrchestratorValidationError(f"Can't shutdown node {hostname}: {e}") + return f'Shutdown scheduled on {hostname}' + + @handle_orch_error + def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> str: + if not yes_i_really_mean_it: + raise OrchestratorError("you must pass --yes-i-really-mean-it") + + try: + self.node_proxy.powercycle(hostname) + except RuntimeError as e: + self.log.error(e) + raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}') + except HTTPError as e: + self.log.error(e) + raise OrchestratorValidationError(f"Can't perform powercycle on node {hostname}: {e}") + return f'Powercycle scheduled on {hostname}' + + @handle_orch_error + def node_proxy_fullreport(self, hostname: Optional[str] = None) -> Dict[str, Any]: + return self.node_proxy_cache.fullreport(hostname=hostname) + + @handle_orch_error + def node_proxy_summary(self, hostname: Optional[str] = None) -> Dict[str, Any]: + return self.node_proxy_cache.summary(hostname=hostname) + + @handle_orch_error + def node_proxy_firmwares(self, hostname: Optional[str] = None) -> Dict[str, Any]: + return self.node_proxy_cache.firmwares(hostname=hostname) + + @handle_orch_error + def node_proxy_criticals(self, hostname: Optional[str] = None) -> Dict[str, Any]: + return self.node_proxy_cache.criticals(hostname=hostname) + + @handle_orch_error + def node_proxy_common(self, category: str, hostname: Optional[str] = None) -> Dict[str, Any]: + return self.node_proxy_cache.common(category, hostname=hostname) + + @handle_orch_error + def remove_host(self, host: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> str: """ Remove a host from orchestrator management. @@ -1701,6 +1831,17 @@ Then run the following: } run_cmd(cmd_args) + if rm_crush_entry: + try: + self.check_mon_command({ + 'prefix': 'osd crush remove', + 'name': host, + }) + except MonCommandFailed as e: + self.log.error(f'Couldn\'t remove host {host} from CRUSH map: {str(e)}') + return (f'Cephadm failed removing host {host}\n' + f'Failed to remove host {host} from the CRUSH map: {str(e)}') + self.inventory.rm_host(host) self.cache.rm_host(host) self.ssh.reset_con(host) @@ -2665,6 +2806,15 @@ Then run the following: pass deps = sorted([self.get_mgr_ip(), server_port, root_cert, str(self.device_enhanced_scan)]) + elif daemon_type == 'node-proxy': + root_cert = '' + server_port = '' + try: + server_port = str(self.http_server.agent.server_port) + root_cert = self.http_server.agent.ssl_certs.get_root_cert() + except Exception: + pass + deps = sorted([self.get_mgr_ip(), server_port, root_cert]) elif daemon_type == 'iscsi': if spec: iscsi_spec = cast(IscsiServiceSpec, spec) @@ -2704,6 +2854,12 @@ Then run the following: deps.append(f'{hash(alertmanager_user + alertmanager_password)}') elif daemon_type == 'promtail': deps += get_daemon_names(['loki']) + elif daemon_type == JaegerAgentService.TYPE: + for dd in self.cache.get_daemons_by_type(JaegerCollectorService.TYPE): + assert dd.hostname is not None + port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT + deps.append(build_url(host=dd.hostname, port=port).lstrip('/')) + deps = sorted(deps) else: # TODO(redo): some error message! pass @@ -3347,8 +3503,7 @@ Then run the following: """ for osd_id in osd_ids: try: - self.to_remove_osds.rm(OSD(osd_id=int(osd_id), - remove_util=self.to_remove_osds.rm_util)) + self.to_remove_osds.rm_by_osd_id(int(osd_id)) except (NotFoundError, KeyError, ValueError): return f'Unable to find OSD in the queue: {osd_id}' @@ -3382,6 +3537,18 @@ Then run the following: f"It is recommended to add the {SpecialHostLabels.ADMIN} label to another host" " before completing this operation.\nIf you're certain this is" " what you want rerun this command with --force.") + # if the user has specified the host we are going to drain + # explicitly in any service spec, warn the user. Having a + # drained host listed in a placement provides no value, so + # they may want to fix it. + services_matching_drain_host: List[str] = [] + for sname, sspec in self.spec_store.all_specs.items(): + if sspec.placement.hosts and hostname in [h.hostname for h in sspec.placement.hosts]: + services_matching_drain_host.append(sname) + if services_matching_drain_host: + raise OrchestratorValidationError(f'Host {hostname} was found explicitly listed in the placements ' + f'of services:\n {services_matching_drain_host}.\nPlease update those ' + 'specs to not list this host.\nThis warning can be bypassed with --force') self.add_host_label(hostname, '_no_schedule') if not keep_conf_keyring: diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py index 6666d761e..98d2fe998 100644 --- a/src/pybind/mgr/cephadm/schedule.py +++ b/src/pybind/mgr/cephadm/schedule.py @@ -413,6 +413,8 @@ class HostAssignment(object): hostname=x.hostname, ports=self.ports_start) for x in self.hosts_by_label(self.spec.placement.label) ] + if self.spec.placement.host_pattern: + ls = [h for h in ls if h.hostname in self.spec.placement.filter_matching_hostspecs(self.hosts)] elif self.spec.placement.host_pattern: ls = [ DaemonPlacement(daemon_type=self.primary_daemon_type, diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 5dfdc27a3..7bfb3f633 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -67,7 +67,6 @@ class CephadmServe: of cephadm. This loop will then attempt to apply this new state. """ self.log.debug("serve starting") - self.mgr.config_checker.load_network_config() while self.mgr.run: self.log.debug("serve loop start") @@ -113,9 +112,15 @@ class CephadmServe: if self.mgr.agent_helpers._handle_use_agent_setting(): continue + if self.mgr.node_proxy_service.handle_hw_monitoring_setting(): + continue + if self.mgr.upgrade.continue_upgrade(): continue + # refresh node-proxy cache + self.mgr.node_proxy_cache.load() + except OrchestratorError as e: if e.event_subject: self.mgr.events.from_orch_error(e) @@ -316,7 +321,9 @@ class CephadmServe: self.mgr.agent_helpers._update_agent_down_healthcheck(agents_down) self.mgr.http_server.config_update() - self.mgr.config_checker.run_checks() + if self.mgr.config_checks_enabled: + self.mgr.config_checker.load_network_config() + self.mgr.config_checker.run_checks() for k in [ 'CEPHADM_HOST_CHECK_FAILED', @@ -882,6 +889,13 @@ class CephadmServe: hosts_altered.add(d.hostname) break + # do not attempt to deploy node-proxy agent when oob details are not provided. + if slot.daemon_type == 'node-proxy' and slot.hostname not in self.mgr.node_proxy_cache.oob.keys(): + self.log.debug( + f'Not deploying node-proxy agent on {slot.hostname} as oob details are not present.' + ) + continue + # deploy new daemon daemon_id = slot.name @@ -1060,6 +1074,11 @@ class CephadmServe: diff = list(set(last_deps) - set(deps)) if any('secure_monitoring_stack' in e for e in diff): action = 'redeploy' + elif dd.daemon_type == 'jaeger-agent': + # changes to jaeger-agent deps affect the way the unit.run for + # the daemon is written, which we rewrite on redeploy, but not + # on reconfig. + action = 'redeploy' elif spec is not None and hasattr(spec, 'extra_container_args') and dd.extra_container_args != spec.extra_container_args: self.log.debug( diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py index ddc0574e2..b3b7b5499 100644 --- a/src/pybind/mgr/cephadm/service_discovery.py +++ b/src/pybind/mgr/cephadm/service_discovery.py @@ -19,6 +19,7 @@ import secrets from cephadm.services.ingress import IngressSpec from cephadm.ssl_cert_utils import SSLCerts from cephadm.services.cephadmservice import CephExporterService +from cephadm.services.nvmeof import NvmeofService if TYPE_CHECKING: from cephadm.module import CephadmOrchestrator @@ -145,6 +146,7 @@ class Root(Server): <p><a href='prometheus/sd-config?service=node-exporter'>Node exporter http sd-config</a></p> <p><a href='prometheus/sd-config?service=haproxy'>HAProxy http sd-config</a></p> <p><a href='prometheus/sd-config?service=ceph-exporter'>Ceph exporter http sd-config</a></p> +<p><a href='prometheus/sd-config?service=nvmeof'>NVMeoF http sd-config</a></p> <p><a href='prometheus/rules'>Prometheus rules</a></p> </body> </html>''' @@ -163,6 +165,8 @@ class Root(Server): return self.haproxy_sd_config() elif service == 'ceph-exporter': return self.ceph_exporter_sd_config() + elif service == 'nvmeof': + return self.nvmeof_sd_config() else: return [] @@ -231,6 +235,19 @@ class Root(Server): }) return srv_entries + def nvmeof_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return <http_sd_config> compatible prometheus config for nvmeof service.""" + srv_entries = [] + for dd in self.mgr.cache.get_daemons_by_type('nvmeof'): + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + port = NvmeofService.PROMETHEUS_PORT + srv_entries.append({ + 'targets': [build_url(host=addr, port=port).lstrip('/')], + 'labels': {'instance': dd.hostname} + }) + return srv_entries + @cherrypy.expose(alias='prometheus/rules') def get_prometheus_rules(self) -> str: """Return currently configured prometheus rules as Yaml.""" diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 7d7a04dad..f3c45b164 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -41,7 +41,7 @@ def get_auth_entity(daemon_type: str, daemon_id: str, host: str = "") -> AuthEnt # the CephService class refers to service types, not daemon types if daemon_type in ['rgw', 'rbd-mirror', 'cephfs-mirror', 'nfs', "iscsi", 'nvmeof', 'ingress', 'ceph-exporter']: return AuthEntity(f'client.{daemon_type}.{daemon_id}') - elif daemon_type in ['crash', 'agent']: + elif daemon_type in ['crash', 'agent', 'node-proxy']: if host == "": raise OrchestratorError( f'Host not provided to generate <{daemon_type}> auth entity name') diff --git a/src/pybind/mgr/cephadm/services/ingress.py b/src/pybind/mgr/cephadm/services/ingress.py index 55be30454..5edd2517d 100644 --- a/src/pybind/mgr/cephadm/services/ingress.py +++ b/src/pybind/mgr/cephadm/services/ingress.py @@ -187,6 +187,7 @@ class IngressService(CephService): 'monitor_port': daemon_spec.ports[1] if daemon_spec.ports else spec.monitor_port, 'local_host_ip': host_ip, 'default_server_opts': server_opts, + 'health_check_interval': spec.health_check_interval or '2s', } ) config_files = { diff --git a/src/pybind/mgr/cephadm/services/jaeger.py b/src/pybind/mgr/cephadm/services/jaeger.py index c136d20e6..c83c765d0 100644 --- a/src/pybind/mgr/cephadm/services/jaeger.py +++ b/src/pybind/mgr/cephadm/services/jaeger.py @@ -20,13 +20,16 @@ class JaegerAgentService(CephadmService): def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type collectors = [] + deps: List[str] = [] for dd in self.mgr.cache.get_daemons_by_type(JaegerCollectorService.TYPE): # scrape jaeger-collector nodes assert dd.hostname is not None port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT url = build_url(host=dd.hostname, port=port).lstrip('/') collectors.append(url) + deps.append(url) daemon_spec.final_config = {'collector_nodes': ",".join(collectors)} + daemon_spec.deps = sorted(deps) return daemon_spec diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 114c84860..10ddcbbd0 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -67,13 +67,24 @@ class GrafanaService(CephadmService): spec: GrafanaSpec = cast( GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name]) + + grafana_port = daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT + grafana_ip = daemon_spec.ip if daemon_spec.ip else '' + + if spec.only_bind_port_on_networks and spec.networks: + assert daemon_spec.host is not None + ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) + if ip_to_bind_to: + daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to} + grafana_ip = ip_to_bind_to + grafana_ini = self.mgr.template.render( 'services/grafana/grafana.ini.j2', { 'anonymous_access': spec.anonymous_access, 'initial_admin_password': spec.initial_admin_password, - 'http_port': daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT, + 'http_port': grafana_port, 'protocol': spec.protocol, - 'http_addr': daemon_spec.ip if daemon_spec.ip else '' + 'http_addr': grafana_ip }) if 'dashboard' in self.mgr.get('mgr_map')['modules'] and spec.initial_admin_password: @@ -402,6 +413,7 @@ class PrometheusService(CephadmService): haproxy_sd_url = f'{srv_end_point}service=haproxy' if haproxy_cnt > 0 else None mgr_prometheus_sd_url = f'{srv_end_point}service=mgr-prometheus' # always included ceph_exporter_sd_url = f'{srv_end_point}service=ceph-exporter' # always included + nvmeof_sd_url = f'{srv_end_point}service=nvmeof' # always included alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() @@ -417,9 +429,17 @@ class PrometheusService(CephadmService): 'node_exporter_sd_url': node_exporter_sd_url, 'alertmanager_sd_url': alertmanager_sd_url, 'haproxy_sd_url': haproxy_sd_url, - 'ceph_exporter_sd_url': ceph_exporter_sd_url + 'ceph_exporter_sd_url': ceph_exporter_sd_url, + 'nvmeof_sd_url': nvmeof_sd_url, } + ip_to_bind_to = '' + if spec.only_bind_port_on_networks and spec.networks: + assert daemon_spec.host is not None + ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) or '' + if ip_to_bind_to: + daemon_spec.port_ips = {str(port): ip_to_bind_to} + web_context = { 'prometheus_web_user': prometheus_user, 'prometheus_web_password': password_hash(prometheus_password), @@ -446,6 +466,7 @@ class PrometheusService(CephadmService): }, 'retention_time': retention_time, 'retention_size': retention_size, + 'ip_to_bind_to': ip_to_bind_to, 'web_config': '/etc/prometheus/web.yml' } else: @@ -454,7 +475,8 @@ class PrometheusService(CephadmService): 'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context) }, 'retention_time': retention_time, - 'retention_size': retention_size + 'retention_size': retention_size, + 'ip_to_bind_to': ip_to_bind_to } # include alerts, if present in the container diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py index f94a00f5b..e0c61b117 100644 --- a/src/pybind/mgr/cephadm/services/nfs.py +++ b/src/pybind/mgr/cephadm/services/nfs.py @@ -5,6 +5,8 @@ import os import subprocess import tempfile from typing import Dict, Tuple, Any, List, cast, Optional +from configparser import ConfigParser +from io import StringIO from mgr_module import HandleCommandResult from mgr_module import NFS_POOL_NAME as POOL_NAME @@ -79,6 +81,8 @@ class NFSService(CephService): nodeid = f'{daemon_spec.service_name}.{daemon_spec.rank}' + nfs_idmap_conf = '/etc/ganesha/idmap.conf' + # create the RADOS recovery pool keyring rados_user = f'{daemon_type}.{daemon_id}' rados_keyring = self.create_keyring(daemon_spec) @@ -115,12 +119,27 @@ class NFSService(CephService): "port": daemon_spec.ports[0] if daemon_spec.ports else 2049, "bind_addr": bind_addr, "haproxy_hosts": [], + "nfs_idmap_conf": nfs_idmap_conf, } if spec.enable_haproxy_protocol: context["haproxy_hosts"] = self._haproxy_hosts() logger.debug("selected haproxy_hosts: %r", context["haproxy_hosts"]) return self.mgr.template.render('services/nfs/ganesha.conf.j2', context) + # generate the idmap config + def get_idmap_conf() -> str: + idmap_conf = spec.idmap_conf + output = '' + if idmap_conf is not None: + cp = ConfigParser() + out = StringIO() + cp.read_dict(idmap_conf) + cp.write(out) + out.seek(0) + output = out.read() + out.close() + return output + # generate the cephadm config json def get_cephadm_config() -> Dict[str, Any]: config: Dict[str, Any] = {} @@ -130,6 +149,7 @@ class NFSService(CephService): config['extra_args'] = ['-N', 'NIV_EVENT'] config['files'] = { 'ganesha.conf': get_ganesha_conf(), + 'idmap.conf': get_idmap_conf() } config.update( self.get_config_and_keyring( diff --git a/src/pybind/mgr/cephadm/services/node_proxy.py b/src/pybind/mgr/cephadm/services/node_proxy.py new file mode 100644 index 000000000..e5608ca42 --- /dev/null +++ b/src/pybind/mgr/cephadm/services/node_proxy.py @@ -0,0 +1,180 @@ +import json +import ssl +import base64 + +from urllib.error import HTTPError, URLError +from typing import List, Any, Dict, Tuple, Optional, MutableMapping + +from .cephadmservice import CephadmDaemonDeploySpec, CephService +from ceph.deployment.service_spec import ServiceSpec, PlacementSpec +from ceph.utils import http_req +from orchestrator import OrchestratorError + + +class NodeProxy(CephService): + TYPE = 'node-proxy' + + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: + assert self.TYPE == daemon_spec.daemon_type + daemon_id, host = daemon_spec.daemon_id, daemon_spec.host + + if not self.mgr.http_server.agent: + raise OrchestratorError('Cannot deploy node-proxy before creating cephadm endpoint') + + keyring = self.get_keyring_with_caps(self.get_auth_entity(daemon_id, host=host), []) + daemon_spec.keyring = keyring + self.mgr.node_proxy_cache.update_keyring(host, keyring) + + daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) + + return daemon_spec + + def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: + # node-proxy is re-using the agent endpoint and therefore + # needs similar checks to see if the endpoint is ready. + self.agent_endpoint = self.mgr.http_server.agent + try: + assert self.agent_endpoint + assert self.agent_endpoint.ssl_certs.get_root_cert() + assert self.agent_endpoint.server_port + except Exception: + raise OrchestratorError( + 'Cannot deploy node-proxy daemons until cephadm endpoint has finished generating certs') + + listener_cert, listener_key = self.agent_endpoint.ssl_certs.generate_cert(daemon_spec.host, self.mgr.inventory.get_addr(daemon_spec.host)) + cfg = { + 'target_ip': self.mgr.get_mgr_ip(), + 'target_port': self.agent_endpoint.server_port, + 'name': f'node-proxy.{daemon_spec.host}', + 'keyring': daemon_spec.keyring, + 'root_cert.pem': self.agent_endpoint.ssl_certs.get_root_cert(), + 'listener.crt': listener_cert, + 'listener.key': listener_key, + } + config = {'node-proxy.json': json.dumps(cfg)} + + return config, sorted([str(self.mgr.get_mgr_ip()), str(self.agent_endpoint.server_port), + self.agent_endpoint.ssl_certs.get_root_cert()]) + + def handle_hw_monitoring_setting(self) -> bool: + # function to apply or remove node-proxy service spec depending + # on whether the hw_mointoring config option is set or not. + # It should return True when it either creates or deletes a spec + # and False otherwise. + if self.mgr.hw_monitoring: + if 'node-proxy' not in self.mgr.spec_store: + spec = ServiceSpec( + service_type='node-proxy', + placement=PlacementSpec(host_pattern='*') + ) + self.mgr.spec_store.save(spec) + return True + return False + else: + if 'node-proxy' in self.mgr.spec_store: + self.mgr.spec_store.rm('node-proxy') + return True + return False + + def get_ssl_ctx(self) -> ssl.SSLContext: + ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert() + ssl_ctx = ssl.create_default_context() + ssl_ctx.check_hostname = True + ssl_ctx.verify_mode = ssl.CERT_REQUIRED + ssl_ctx.load_verify_locations(cadata=ssl_root_crt) + return ssl_ctx + + def led(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]: + ssl_ctx: ssl.SSLContext = self.get_ssl_ctx() + header: MutableMapping[str, str] = {} + method: str = 'PATCH' if action in ['on', 'off'] else 'GET' + payload: Optional[Dict[str, str]] = None + addr: str = self.mgr.inventory.get_addr(hostname) + endpoint: List[str] = ['led', light_type] + _device: str = device if device else '' + + if light_type == 'drive': + endpoint.append(_device) + + if method == 'PATCH': + payload = dict(state=action) + + header = self.generate_auth_header(hostname) + + endpoint = f'/{"/".join(endpoint)}' + + try: + headers, result, status = http_req(hostname=addr, + port='9456', + headers=header, + method=method, + data=json.dumps(payload), + endpoint=endpoint, + ssl_ctx=ssl_ctx) + result_json = json.loads(result) + except HTTPError as e: + self.mgr.log.error(e) + raise + except URLError as e: + raise RuntimeError(e) + + return result_json + + def generate_auth_header(self, hostname: str) -> Dict[str, str]: + try: + username = self.mgr.node_proxy_cache.oob[hostname]['username'] + password = self.mgr.node_proxy_cache.oob[hostname]['password'] + auth: bytes = f'{username}:{password}'.encode('utf-8') + auth_str: str = base64.b64encode(auth).decode('utf-8') + header = {'Authorization': f'Basic {auth_str}'} + except KeyError as e: + self.mgr.log.error(f'Check oob information is provided for {hostname}.') + raise RuntimeError(e) + return header + + def shutdown(self, hostname: str, force: Optional[bool] = False) -> Dict[str, Any]: + ssl_ctx: ssl.SSLContext = self.get_ssl_ctx() + header: Dict[str, str] = self.generate_auth_header(hostname) + addr: str = self.mgr.inventory.get_addr(hostname) + + endpoint = '/shutdown' + payload: Dict[str, Optional[bool]] = dict(force=force) + + try: + headers, result, status = http_req(hostname=addr, + port='9456', + headers=header, + data=json.dumps(payload), + endpoint=endpoint, + ssl_ctx=ssl_ctx) + result_json = json.loads(result) + except HTTPError as e: + self.mgr.log.error(e) + raise + except URLError as e: + raise RuntimeError(e) + + return result_json + + def powercycle(self, hostname: str) -> Dict[str, Any]: + ssl_ctx: ssl.SSLContext = self.get_ssl_ctx() + header: Dict[str, str] = self.generate_auth_header(hostname) + addr: str = self.mgr.inventory.get_addr(hostname) + + endpoint = '/powercycle' + + try: + headers, result, status = http_req(hostname=addr, + port='9456', + headers=header, + data="{}", + endpoint=endpoint, + ssl_ctx=ssl_ctx) + result_json = json.loads(result) + except HTTPError as e: + self.mgr.log.error(e) + raise + except URLError as e: + raise RuntimeError(e) + + return result_json diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index 7d2dd16cf..5f28273d4 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -15,6 +15,7 @@ logger = logging.getLogger(__name__) class NvmeofService(CephService): TYPE = 'nvmeof' + PROMETHEUS_PORT = 10008 def config(self, spec: NvmeofServiceSpec) -> None: # type: ignore assert self.TYPE == spec.service_type diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index bfecc5723..75b3fc58c 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -319,11 +319,16 @@ class OSDService(CephService): logger.exception('Cannot decode JSON: \'%s\'' % ' '.join(out)) concat_out = {} notes = [] - if osdspec.data_devices is not None and osdspec.data_devices.limit and len(concat_out) < osdspec.data_devices.limit: + if ( + osdspec.data_devices is not None + and osdspec.data_devices.limit + and (len(concat_out) + ds.existing_daemons) < osdspec.data_devices.limit + ): found = len(concat_out) limit = osdspec.data_devices.limit notes.append( - f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit (Found: {found} | Limit: {limit})') + f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit\n' + f'(New Devices: {found} | Existing Matching Daemons: {ds.existing_daemons} | Limit: {limit})') ret_all.append({'data': concat_out, 'osdspec': osdspec.service_id, 'host': host, @@ -664,6 +669,7 @@ class OSD: return None self.started = True self.stopped = False + self.original_weight = self.rm_util.get_weight(self) def start_draining(self) -> bool: if self.stopped: @@ -672,7 +678,6 @@ class OSD: if self.replace: self.rm_util.set_osd_flag([self], 'out') else: - self.original_weight = self.rm_util.get_weight(self) self.rm_util.reweight_osd(self, 0.0) self.drain_started_at = datetime.utcnow() self.draining = True @@ -761,6 +766,7 @@ class OSD: out['force'] = self.force out['zap'] = self.zap out['hostname'] = self.hostname # type: ignore + out['original_weight'] = self.original_weight for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']: if getattr(self, k): @@ -953,6 +959,16 @@ class OSDRemovalQueue(object): self.osds.add(osd) osd.start() + def rm_by_osd_id(self, osd_id: int) -> None: + osd: Optional["OSD"] = None + for o in self.osds: + if o.osd_id == osd_id: + osd = o + if not osd: + logger.debug(f"Could not find osd with id {osd_id} in queue.") + raise KeyError(f'No osd with id {osd_id} in removal queue') + self.rm(osd) + def rm(self, osd: "OSD") -> None: if not osd.exists: raise NotFoundError() diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py index d17cc0fcc..7460fc159 100644 --- a/src/pybind/mgr/cephadm/ssh.py +++ b/src/pybind/mgr/cephadm/ssh.py @@ -1,6 +1,7 @@ import logging import os import asyncio +import concurrent from tempfile import NamedTemporaryFile from threading import Thread from contextlib import contextmanager @@ -61,7 +62,7 @@ class EventLoopThread(Thread): future = asyncio.run_coroutine_threadsafe(coro, self._loop) try: return future.result(timeout) - except asyncio.TimeoutError: + except (asyncio.TimeoutError, concurrent.futures.TimeoutError): # try to cancel the task before raising the exception further up future.cancel() raise diff --git a/src/pybind/mgr/cephadm/ssl_cert_utils.py b/src/pybind/mgr/cephadm/ssl_cert_utils.py index fcc6f00ea..6295152c7 100644 --- a/src/pybind/mgr/cephadm/ssl_cert_utils.py +++ b/src/pybind/mgr/cephadm/ssl_cert_utils.py @@ -46,7 +46,7 @@ class SSLCerts: root_builder = root_builder.public_key(root_public_key) root_builder = root_builder.add_extension( x509.SubjectAlternativeName( - [x509.IPAddress(ipaddress.IPv4Address(addr))] + [x509.IPAddress(ipaddress.ip_address(addr))] ), critical=False ) @@ -70,12 +70,9 @@ class SSLCerts: def generate_cert(self, host: str, addr: str) -> Tuple[str, str]: have_ip = True try: - ip = x509.IPAddress(ipaddress.IPv4Address(addr)) + ip = x509.IPAddress(ipaddress.ip_address(addr)) except Exception: - try: - ip = x509.IPAddress(ipaddress.IPv6Address(addr)) - except Exception: - have_ip = False + have_ip = False private_key = rsa.generate_private_key( public_exponent=65537, key_size=4096, backend=default_backend()) diff --git a/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 b/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 index 100acce40..9a0309ab4 100644 --- a/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 +++ b/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 @@ -74,7 +74,7 @@ backend backend balance static-rr option httpchk HEAD / HTTP/1.0 {% for server in servers %} - server {{ server.name }} {{ server.ip }}:{{ server.port }} check weight 100 + server {{ server.name }} {{ server.ip }}:{{ server.port }} check weight 100 inter {{ health_check_interval }} {% endfor %} {% endif %} {% if mode == 'tcp' %} @@ -85,6 +85,6 @@ backend backend default-server {{ default_server_opts|join(" ") }} {% endif %} {% for server in servers %} - server {{ server.name }} {{ server.ip }}:{{ server.port }} + server {{ server.name }} {{ server.ip }}:{{ server.port }} check {% endfor %} {% endif %} diff --git a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 index ab8df7192..7bc0278d7 100644 --- a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 @@ -16,6 +16,9 @@ NFSv4 { Delegations = false; RecoveryBackend = 'rados_cluster'; Minor_Versions = 1, 2; +{% if nfs_idmap_conf %} + IdmapConf = "{{ nfs_idmap_conf }}"; +{% endif %} } RADOS_KV { diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 69b8332cd..17290f504 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -7,6 +7,9 @@ port = {{ port }} enable_auth = {{ spec.enable_auth }} state_update_notify = True state_update_interval_sec = 5 +enable_prometheus_exporter = True +prometheus_exporter_ssl = False +prometheus_port = 10008 [ceph] pool = {{ spec.pool }} diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index b56843994..57d2f8a3f 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -107,3 +107,23 @@ scrape_configs: - url: {{ ceph_exporter_sd_url }} {% endif %} {% endif %} + +{% if nvmeof_sd_url %} + - job_name: 'nvmeof' +{% if secure_monitoring_stack %} + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: {{ nvmeof_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} + tls_config: + ca_file: root_cert.pem +{% else %} + http_sd_configs: + - url: {{ nvmeof_sd_url }} +{% endif %} +{% endif %} diff --git a/src/pybind/mgr/cephadm/tests/node_proxy_data.py b/src/pybind/mgr/cephadm/tests/node_proxy_data.py new file mode 100644 index 000000000..37e6aaa46 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/node_proxy_data.py @@ -0,0 +1,3 @@ +full_set_with_critical = {'host': 'host01', 'sn': '12345', 'status': {'storage': {'disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1': {'description': 'Solid State Disk 0:1:0', 'entity': 'RAID.Integrated.1-1', 'capacity_bytes': 959656755200, 'model': 'KPM5XVUG960G', 'protocol': 'SAS', 'serial_number': '8080A1CRTP5F', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 0, 'locationtype': 'Slot'}}}, 'disk.bay.9:enclosure.internal.0-1': {'description': 'PCIe SSD in Slot 9 in Bay 1', 'entity': 'CPU.1', 'capacity_bytes': 1600321314816, 'model': 'Dell Express Flash NVMe P4610 1.6TB SFF', 'protocol': 'PCIe', 'serial_number': 'PHLN035305MN1P6AGN', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 9, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 20, 'total_threads': 40, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'nic.slot.1-1-1': {'description': 'NIC in Slot 1 Port 1 Partition 1', 'name': 'System Ethernet Interface', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'StandbyOffline'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 31237, 'status': {'health': 'Critical', 'state': 'Enabled'}}}}, 'firmwares': {}} +mgr_inventory_cache = {'host01': {'hostname': 'host01', 'addr': '10.10.10.11', 'labels': ['_admin'], 'status': '', 'oob': {'hostname': '10.10.10.11', 'username': 'root', 'password': 'ceph123'}}, 'host02': {'hostname': 'host02', 'addr': '10.10.10.12', 'labels': [], 'status': '', 'oob': {'hostname': '10.10.10.12', 'username': 'root', 'password': 'ceph123'}}} +full_set = {'host01': {'host': 'host01', 'sn': 'FR8Y5X3', 'status': {'storage': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'power': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'fans': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'host02': {'host': 'host02', 'sn': 'FR8Y5X4', 'status': {'storage': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'power': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'fans': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}} diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py index 524da9c00..7994c390a 100644 --- a/src/pybind/mgr/cephadm/tests/test_autotune.py +++ b/src/pybind/mgr/cephadm/tests/test_autotune.py @@ -46,6 +46,17 @@ from orchestrator import DaemonDescription ], {}, 62 * 1024 * 1024 * 1024, + ), + ( + 128 * 1024 * 1024 * 1024, + [ + DaemonDescription('mgr', 'a', 'host1'), + DaemonDescription('osd', '1', 'host1'), + DaemonDescription('osd', '2', 'host1'), + DaemonDescription('nvmeof', 'a', 'host1'), + ], + {}, + 60 * 1024 * 1024 * 1024, ) ]) def test_autotune(total, daemons, config, result): diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 24fcb0280..2477de13e 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -400,6 +400,42 @@ class TestCephadm(object): assert 'myerror' in ''.join(evs) + @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')) + def test_daemon_action_event_timestamp_update(self, cephadm_module: CephadmOrchestrator): + # Test to make sure if a new daemon event is created with the same subject + # and message that the timestamp of the event is updated to let users know + # when it most recently occurred. + cephadm_module.service_cache_timeout = 10 + with with_host(cephadm_module, 'test'): + with with_service(cephadm_module, RGWSpec(service_id='myrgw.foobar', unmanaged=True)) as _, \ + with_daemon(cephadm_module, RGWSpec(service_id='myrgw.foobar'), 'test') as daemon_id: + + d_name = 'rgw.' + daemon_id + + now = str_to_datetime('2023-10-18T22:45:29.119250Z') + with mock.patch("cephadm.inventory.datetime_now", lambda: now): + c = cephadm_module.daemon_action('redeploy', d_name) + assert wait(cephadm_module, + c) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'" + + CephadmServe(cephadm_module)._check_daemons() + + d_events = cephadm_module.events.get_for_daemon(d_name) + assert len(d_events) == 1 + assert d_events[0].created == now + + later = str_to_datetime('2023-10-18T23:46:37.119250Z') + with mock.patch("cephadm.inventory.datetime_now", lambda: later): + c = cephadm_module.daemon_action('redeploy', d_name) + assert wait(cephadm_module, + c) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'" + + CephadmServe(cephadm_module)._check_daemons() + + d_events = cephadm_module.events.get_for_daemon(d_name) + assert len(d_events) == 1 + assert d_events[0].created == later + @pytest.mark.parametrize( "action", [ @@ -1157,7 +1193,8 @@ class TestCephadm(object): @mock.patch('cephadm.services.osd.OSDService.driveselection_to_ceph_volume') @mock.patch('cephadm.services.osd.OsdIdClaims.refresh', lambda _: None) @mock.patch('cephadm.services.osd.OsdIdClaims.get', lambda _: {}) - def test_limit_not_reached(self, d_to_cv, _run_cv_cmd, cephadm_module): + @mock.patch('cephadm.inventory.HostCache.get_daemons_by_service') + def test_limit_not_reached(self, _get_daemons_by_service, d_to_cv, _run_cv_cmd, cephadm_module): with with_host(cephadm_module, 'test'): dg = DriveGroupSpec(placement=PlacementSpec(host_pattern='test'), data_devices=DeviceSelection(limit=5, rotational=1), @@ -1167,12 +1204,14 @@ class TestCephadm(object): '[{"data": "/dev/vdb", "data_size": "50.00 GB", "encryption": "None"}, {"data": "/dev/vdc", "data_size": "50.00 GB", "encryption": "None"}]'] d_to_cv.return_value = 'foo' _run_cv_cmd.side_effect = async_side_effect((disks_found, '', 0)) + _get_daemons_by_service.return_value = [DaemonDescription(daemon_type='osd', hostname='test', service_name='not_enough')] preview = cephadm_module.osd_service.generate_previews([dg], 'test') for osd in preview: assert 'notes' in osd assert osd['notes'] == [ - 'NOTE: Did not find enough disks matching filter on host test to reach data device limit (Found: 2 | Limit: 5)'] + ('NOTE: Did not find enough disks matching filter on host test to reach ' + 'data device limit\n(New Devices: 2 | Existing Matching Daemons: 1 | Limit: 5)')] @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) def test_prepare_drivegroup(self, cephadm_module): @@ -1251,7 +1290,11 @@ class TestCephadm(object): )) @mock.patch("cephadm.services.osd.OSD.exists", True) @mock.patch("cephadm.services.osd.RemoveUtil.get_pg_count", lambda _, __: 0) - def test_remove_osds(self, cephadm_module): + @mock.patch("cephadm.services.osd.RemoveUtil.get_weight") + @mock.patch("cephadm.services.osd.RemoveUtil.reweight_osd") + def test_remove_osds(self, _reweight_osd, _get_weight, cephadm_module): + osd_initial_weight = 2.1 + _get_weight.return_value = osd_initial_weight with with_host(cephadm_module, 'test'): CephadmServe(cephadm_module)._refresh_host_daemons('test') c = cephadm_module.list_daemons() @@ -1261,13 +1304,23 @@ class TestCephadm(object): out = wait(cephadm_module, c) assert out == ["Removed osd.0 from host 'test'"] - cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0, - replace=False, - force=False, - hostname='test', - process_started_at=datetime_now(), - remove_util=cephadm_module.to_remove_osds.rm_util - )) + osd_0 = OSD(osd_id=0, + replace=False, + force=False, + hostname='test', + process_started_at=datetime_now(), + remove_util=cephadm_module.to_remove_osds.rm_util + ) + + cephadm_module.to_remove_osds.enqueue(osd_0) + _get_weight.assert_called() + + # test that OSD is properly reweighted on removal + cephadm_module.stop_remove_osds([0]) + _reweight_osd.assert_called_with(mock.ANY, osd_initial_weight) + + # add OSD back to queue and test normal removal queue processing + cephadm_module.to_remove_osds.enqueue(osd_0) cephadm_module.to_remove_osds.process_removal_queue() assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module) diff --git a/src/pybind/mgr/cephadm/tests/test_configchecks.py b/src/pybind/mgr/cephadm/tests/test_configchecks.py index 3cae0a27d..ff1e21861 100644 --- a/src/pybind/mgr/cephadm/tests/test_configchecks.py +++ b/src/pybind/mgr/cephadm/tests/test_configchecks.py @@ -238,6 +238,7 @@ class FakeMgr: self.default_version = 'quincy' self.version_overrides = {} self.daemon_to_host = {} + self.config_checks_enabled = True self.cache = HostCache(self) self.upgrade = CephadmUpgrade(self) @@ -623,9 +624,7 @@ class TestConfigCheck: assert 'ceph_release' in checker.skipped_checks def test_skip_when_disabled(self, mgr): - mgr.module_option.update({ - "config_checks_enabled": "false" - }) + mgr.config_checks_enabled = False checker = CephadmConfigChecks(mgr) checker.cluster_network_list = [] checker.public_network_list = ['10.9.64.0/24'] diff --git a/src/pybind/mgr/cephadm/tests/test_node_proxy.py b/src/pybind/mgr/cephadm/tests/test_node_proxy.py new file mode 100644 index 000000000..b19bb5dbc --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_node_proxy.py @@ -0,0 +1,312 @@ +import cherrypy +import json +from _pytest.monkeypatch import MonkeyPatch +from urllib.error import URLError +from cherrypy.test import helper +from cephadm.agent import NodeProxyEndpoint +from unittest.mock import MagicMock, call, patch +from cephadm.inventory import AgentCache, NodeProxyCache, Inventory +from cephadm.ssl_cert_utils import SSLCerts +from . import node_proxy_data + +PORT = 58585 + + +class FakeMgr: + def __init__(self) -> None: + self.log = MagicMock() + self.get_store = MagicMock(return_value=json.dumps(node_proxy_data.mgr_inventory_cache)) + self.set_store = MagicMock() + self.set_health_warning = MagicMock() + self.remove_health_warning = MagicMock() + self.inventory = Inventory(self) + self.agent_cache = AgentCache(self) + self.agent_cache.agent_ports = {"host01": 1234} + self.node_proxy_cache = NodeProxyCache(self) + self.node_proxy_cache.save = MagicMock() + self.node_proxy = MagicMock() + self.http_server = MagicMock() + self.http_server.agent = MagicMock() + self.http_server.agent.ssl_certs = SSLCerts() + self.http_server.agent.ssl_certs.generate_root_cert(self.get_mgr_ip()) + + def get_mgr_ip(self) -> str: + return '0.0.0.0' + + +class TestNodeProxyEndpoint(helper.CPWebCase): + mgr = FakeMgr() + app = NodeProxyEndpoint(mgr) + mgr.node_proxy_cache.keyrings = {"host01": "fake-secret01", + "host02": "fake-secret02"} + mgr.node_proxy_cache.oob = {"host01": {"username": "oob-user01", + "password": "oob-pass01"}, + "host02": {"username": "oob-user02", + "password": "oob-pass02"}} + mgr.node_proxy_cache.data = node_proxy_data.full_set + + @classmethod + def setup_server(cls): + # cherrypy.tree.mount(NodeProxyEndpoint(TestNodeProxyEndpoint.mgr)) + cherrypy.tree.mount(TestNodeProxyEndpoint.app) + cherrypy.config.update({'global': { + 'server.socket_host': '127.0.0.1', + 'server.socket_port': PORT}}) + + def setUp(self): + self.PORT = PORT + self.monkeypatch = MonkeyPatch() + + def test_oob_data_misses_cephx_field(self): + data = '{}' + self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('400 Bad Request') + + def test_oob_data_misses_name_field(self): + data = '{"cephx": {"secret": "fake-secret"}}' + self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('400 Bad Request') + + def test_oob_data_misses_secret_field(self): + data = '{"cephx": {"name": "node-proxy.host01"}}' + self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('400 Bad Request') + + def test_oob_agent_not_running(self): + data = '{"cephx": {"name": "node-proxy.host03", "secret": "fake-secret03"}}' + self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('502 Bad Gateway') + + def test_oob_wrong_keyring(self): + data = '{"cephx": {"name": "node-proxy.host01", "secret": "wrong-keyring"}}' + self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('403 Forbidden') + + def test_oob_ok(self): + data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}' + self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('200 OK') + + def test_data_missing_patch(self): + data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}' + self.getPage("/data", method="POST", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('400 Bad Request') + + def test_data_raises_alert(self): + patch = node_proxy_data.full_set_with_critical + data = {"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}, "patch": patch} + data_str = json.dumps(data) + self.getPage("/data", method="POST", body=data_str, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data_str)))]) + self.assertStatus('200 OK') + + calls = [call('HARDWARE_STORAGE', + count=2, + detail=['disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1 is critical: Enabled', + 'disk.bay.9:enclosure.internal.0-1 is critical: Enabled'], + summary='2 storage members are not ok'), + call('HARDWARE_MEMORY', + count=1, + detail=['dimm.socket.a1 is critical: Enabled'], + summary='1 memory member is not ok')] + + assert TestNodeProxyEndpoint.mgr.set_health_warning.mock_calls == calls + + def test_led_GET_no_hostname(self): + self.getPage("/led", method="GET") + self.assertStatus('501 Not Implemented') + + def test_led_PATCH_no_hostname(self): + data = "{}" + self.getPage("/led", method="PATCH", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('501 Not Implemented') + + def test_set_led_no_type(self): + data = '{"state": "on", "keyring": "fake-secret01"}' + self.getPage("/host01/led", method="PATCH", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('400 Bad Request') + + def test_set_chassis_led(self): + data = '{"state": "on", "keyring": "fake-secret01"}' + with patch('cephadm.agent.http_req') as p: + p.return_value = [], '{}', 200 + self.getPage("/host01/led/chassis", method="PATCH", body=data, headers=[('Content-Type', 'application/json'), + ('Content-Length', str(len(data)))]) + self.assertStatus('200 OK') + + def test_get_led_missing_type(self): + self.getPage("/host01/led", method="GET") + self.assertStatus('400 Bad Request') + + def test_get_led_no_hostname(self): + self.getPage("/led", method="GET") + self.assertStatus('501 Not Implemented') + + def test_get_led_type_chassis_no_hostname(self): + self.getPage("/led/chassis", method="GET") + self.assertStatus('404 Not Found') + + def test_get_led_type_drive_no_hostname(self): + self.getPage("/led/chassis", method="GET") + self.assertStatus('404 Not Found') + + def test_get_led_type_drive_missing_id(self): + self.getPage("/host01/led/drive", method="GET") + self.assertStatus('400 Bad Request') + + def test_get_led_url_error(self): + with patch('cephadm.agent.http_req') as p: + p.side_effect = URLError('fake error') + self.getPage("/host02/led/chassis", method="GET") + self.assertStatus('502 Bad Gateway') + + def test_get_chassis_led_ok(self): + with patch('cephadm.agent.http_req', return_value=MagicMock()) as p: + p.return_value = [], '{}', 200 + self.getPage("/host01/led/chassis", method="GET") + self.assertStatus('200 OK') + + def test_get_drive_led_without_id(self): + self.getPage("/host01/led/drive", method="GET") + self.assertStatus('400 Bad Request') + + def test_get_drive_led_with_id(self): + with patch('cephadm.agent.http_req', return_value=MagicMock()) as p: + p.return_value = [], '{}', 200 + self.getPage("/host01/led/drive/123", method="GET") + self.assertStatus('200 OK') + + def test_fullreport_with_valid_hostname(self): + # data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}' + # self.getPage("/host02/fullreport", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) + self.getPage("/host02/fullreport", method="GET") + self.assertStatus('200 OK') + + def test_fullreport_no_hostname(self): + # data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}' + # self.getPage("/fullreport", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) + self.getPage("/fullreport", method="GET") + self.assertStatus('200 OK') + + def test_fullreport_with_invalid_hostname(self): + # data = '{"cephx": {"name": "node-proxy.host03", "secret": "fake-secret03"}}' + # self.getPage("/host03/fullreport", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))]) + self.getPage("/host03/fullreport", method="GET") + self.assertStatus('404 Not Found') + + def test_summary_with_valid_hostname(self): + self.getPage("/host02/summary", method="GET") + self.assertStatus('200 OK') + + def test_summary_no_hostname(self): + self.getPage("/summary", method="GET") + self.assertStatus('200 OK') + + def test_summary_with_invalid_hostname(self): + self.getPage("/host03/summary", method="GET") + self.assertStatus('404 Not Found') + + def test_criticals_with_valid_hostname(self): + self.getPage("/host02/criticals", method="GET") + self.assertStatus('200 OK') + + def test_criticals_no_hostname(self): + self.getPage("/criticals", method="GET") + self.assertStatus('200 OK') + + def test_criticals_with_invalid_hostname(self): + self.getPage("/host03/criticals", method="GET") + self.assertStatus('404 Not Found') + + def test_memory_with_valid_hostname(self): + self.getPage("/host02/memory", method="GET") + self.assertStatus('200 OK') + + def test_memory_no_hostname(self): + self.getPage("/memory", method="GET") + self.assertStatus('200 OK') + + def test_memory_with_invalid_hostname(self): + self.getPage("/host03/memory", method="GET") + self.assertStatus('404 Not Found') + + def test_network_with_valid_hostname(self): + self.getPage("/host02/network", method="GET") + self.assertStatus('200 OK') + + def test_network_no_hostname(self): + self.getPage("/network", method="GET") + self.assertStatus('200 OK') + + def test_network_with_invalid_hostname(self): + self.getPage("/host03/network", method="GET") + self.assertStatus('404 Not Found') + + def test_processors_with_valid_hostname(self): + self.getPage("/host02/processors", method="GET") + self.assertStatus('200 OK') + + def test_processors_no_hostname(self): + self.getPage("/processors", method="GET") + self.assertStatus('200 OK') + + def test_processors_with_invalid_hostname(self): + self.getPage("/host03/processors", method="GET") + self.assertStatus('404 Not Found') + + def test_storage_with_valid_hostname(self): + self.getPage("/host02/storage", method="GET") + self.assertStatus('200 OK') + + def test_storage_no_hostname(self): + self.getPage("/storage", method="GET") + self.assertStatus('200 OK') + + def test_storage_with_invalid_hostname(self): + self.getPage("/host03/storage", method="GET") + self.assertStatus('404 Not Found') + + def test_power_with_valid_hostname(self): + self.getPage("/host02/power", method="GET") + self.assertStatus('200 OK') + + def test_power_no_hostname(self): + self.getPage("/power", method="GET") + self.assertStatus('200 OK') + + def test_power_with_invalid_hostname(self): + self.getPage("/host03/power", method="GET") + self.assertStatus('404 Not Found') + + def test_fans_with_valid_hostname(self): + self.getPage("/host02/fans", method="GET") + self.assertStatus('200 OK') + + def test_fans_no_hostname(self): + self.getPage("/fans", method="GET") + self.assertStatus('200 OK') + + def test_fans_with_invalid_hostname(self): + self.getPage("/host03/fans", method="GET") + self.assertStatus('404 Not Found') + + def test_firmwares_with_valid_hostname(self): + self.getPage("/host02/firmwares", method="GET") + self.assertStatus('200 OK') + + def test_firmwares_no_hostname(self): + self.getPage("/firmwares", method="GET") + self.assertStatus('200 OK') + + def test_firmwares_with_invalid_hostname(self): + self.getPage("/host03/firmwares", method="GET") + self.assertStatus('404 Not Found') diff --git a/src/pybind/mgr/cephadm/tests/test_scheduling.py b/src/pybind/mgr/cephadm/tests/test_scheduling.py index 067cd5028..f445ed6f0 100644 --- a/src/pybind/mgr/cephadm/tests/test_scheduling.py +++ b/src/pybind/mgr/cephadm/tests/test_scheduling.py @@ -6,7 +6,13 @@ from typing import NamedTuple, List, Dict, Optional import pytest from ceph.deployment.hostspec import HostSpec -from ceph.deployment.service_spec import ServiceSpec, PlacementSpec, IngressSpec +from ceph.deployment.service_spec import ( + ServiceSpec, + PlacementSpec, + IngressSpec, + PatternType, + HostPattern, +) from ceph.deployment.hostspec import SpecValidationError from cephadm.module import HostAssignment @@ -631,6 +637,17 @@ class NodeAssignmentTest(NamedTuple): 'rgw:host2(*:81)', 'rgw:host3(*:81)'], ['rgw.c'] ), + # label + host pattern + # Note all hosts will get the "foo" label, we are checking + # that it also filters on the host pattern when label is provided + NodeAssignmentTest( + 'mgr', + PlacementSpec(label='foo', host_pattern='mgr*'), + 'mgr1 mgr2 osd1'.split(), + [], + None, None, + ['mgr:mgr1', 'mgr:mgr2'], ['mgr:mgr1', 'mgr:mgr2'], [] + ), # cephadm.py teuth case NodeAssignmentTest( 'mgr', @@ -1697,3 +1714,42 @@ def test_drain_from_explict_placement(service_type, placement, hosts, maintenanc ).place() assert sorted([h.hostname for h in to_add]) in expected_add assert sorted([h.name() for h in to_remove]) in expected_remove + + +class RegexHostPatternTest(NamedTuple): + service_type: str + placement: PlacementSpec + hosts: List[str] + expected_add: List[List[str]] + + +@pytest.mark.parametrize("service_type,placement,hosts,expected_add", + [ + RegexHostPatternTest( + 'crash', + PlacementSpec(host_pattern=HostPattern(pattern='host1|host3', pattern_type=PatternType.regex)), + 'host1 host2 host3 host4'.split(), + ['host1', 'host3'], + ), + RegexHostPatternTest( + 'crash', + PlacementSpec(host_pattern=HostPattern(pattern='host[2-4]', pattern_type=PatternType.regex)), + 'host1 host2 host3 host4'.split(), + ['host2', 'host3', 'host4'], + ), + ]) +def test_placement_regex_host_pattern(service_type, placement, hosts, expected_add): + spec = ServiceSpec(service_type=service_type, + service_id='test', + placement=placement) + + host_specs = [HostSpec(h) for h in hosts] + + hosts, to_add, to_remove = HostAssignment( + spec=spec, + hosts=host_specs, + unreachable_hosts=[], + draining_hosts=[], + daemons=[], + ).place() + assert sorted([h.hostname for h in to_add]) == expected_add diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index ff98a1388..687b64553 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -19,6 +19,9 @@ class FakeCache: if service_type == 'ceph-exporter': return [FakeDaemonDescription('1.2.3.4', [9926], 'node0'), FakeDaemonDescription('1.2.3.5', [9926], 'node1')] + if service_type == 'nvmeof': + return [FakeDaemonDescription('1.2.3.4', [10008], 'node0'), + FakeDaemonDescription('1.2.3.5', [10008], 'node1')] return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'), FakeDaemonDescription('1.2.3.5', [9200], 'node1')] @@ -171,6 +174,20 @@ class TestServiceDiscovery: # check content assert cfg[0]['targets'] == ['1.2.3.4:9926'] + def test_get_sd_config_nvmeof(self): + mgr = FakeMgr() + root = Root(mgr, 5000, '0.0.0.0') + cfg = root.get_sd_config('nvmeof') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['1.2.3.4:10008'] + def test_get_sd_config_invalid_service(self): mgr = FakeMgr() root = Root(mgr, 5000, '0.0.0.0') diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 2300b288d..1265a39f6 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -376,6 +376,9 @@ port = {default_port} enable_auth = False state_update_notify = True state_update_interval_sec = 5 +enable_prometheus_exporter = True +prometheus_exporter_ssl = False +prometheus_port = 10008 [ceph] pool = {pool} @@ -665,7 +668,9 @@ class TestMonitoring: keepalived_password='12345', virtual_ip="1.2.3.4/32", backend_service='rgw.foo')) as _, \ - with_service(cephadm_module, PrometheusSpec('prometheus')) as _: + with_service(cephadm_module, PrometheusSpec('prometheus', + networks=['1.2.3.0/24'], + only_bind_port_on_networks=True)) as _: y = dedent(""" # This file is generated by cephadm. @@ -699,6 +704,10 @@ class TestMonitoring: honor_labels: true http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter + + - job_name: 'nvmeof' + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof """).lstrip() _run_cephadm.assert_called_with( @@ -713,11 +722,12 @@ class TestMonitoring: "deploy_arguments": [], "params": { 'tcp_ports': [9095], + 'port_ips': {'8765': '1.2.3.1'} }, "meta": { 'service_name': 'prometheus', 'ports': [9095], - 'ip': None, + 'ip': '1.2.3.1', 'deployed_by': [], 'rank': None, 'rank_generation': None, @@ -731,6 +741,7 @@ class TestMonitoring: }, 'retention_time': '15d', 'retention_size': '0', + 'ip_to_bind_to': '1.2.3.1', }, }), ) @@ -855,6 +866,19 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + + - job_name: 'nvmeof' + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: https://[::1]:8765/sd/prometheus/sd-config?service=nvmeof + basic_auth: + username: sd_user + password: sd_password + tls_config: + ca_file: root_cert.pem """).lstrip() _run_cephadm.assert_called_with( @@ -892,6 +916,7 @@ class TestMonitoring: }, 'retention_time': '15d', 'retention_size': '0', + 'ip_to_bind_to': '', 'web_config': '/etc/prometheus/web.yml', }, }), @@ -1633,7 +1658,7 @@ class TestIngressService: ) if enable_haproxy_protocol: haproxy_txt += ' default-server send-proxy-v2\n' - haproxy_txt += ' server nfs.foo.0 192.168.122.111:12049\n' + haproxy_txt += ' server nfs.foo.0 192.168.122.111:12049 check\n' haproxy_expected_conf = { 'files': {'haproxy.cfg': haproxy_txt} } @@ -1783,7 +1808,7 @@ class TestIngressService: 'balance static-rr\n ' 'option httpchk HEAD / HTTP/1.0\n ' 'server ' - + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100\n' + + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100 inter 2s\n' } } @@ -1908,7 +1933,7 @@ class TestIngressService: 'balance static-rr\n ' 'option httpchk HEAD / HTTP/1.0\n ' 'server ' - + haproxy_generated_conf[1][0] + ' 1::4:443 check weight 100\n' + + haproxy_generated_conf[1][0] + ' 1::4:443 check weight 100 inter 2s\n' } } @@ -2032,7 +2057,7 @@ class TestIngressService: 'balance static-rr\n ' 'option httpchk HEAD / HTTP/1.0\n ' 'server ' - + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100\n' + + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100 inter 2s\n' } } @@ -2411,7 +2436,7 @@ class TestIngressService: ' balance source\n' ' hash-type consistent\n' ' default-server send-proxy-v2\n' - ' server nfs.foo.0 192.168.122.111:12049\n' + ' server nfs.foo.0 192.168.122.111:12049 check\n' ) haproxy_expected_conf = { 'files': {'haproxy.cfg': haproxy_txt} @@ -2431,6 +2456,7 @@ class TestIngressService: ' Delegations = false;\n' " RecoveryBackend = 'rados_cluster';\n" ' Minor_Versions = 1, 2;\n' + ' IdmapConf = "/etc/ganesha/idmap.conf";\n' '}\n' '\n' 'RADOS_KV {\n' @@ -2454,7 +2480,7 @@ class TestIngressService: "%url rados://.nfs/foo/conf-nfs.foo" ) nfs_expected_conf = { - 'files': {'ganesha.conf': nfs_ganesha_txt}, + 'files': {'ganesha.conf': nfs_ganesha_txt, 'idmap.conf': ''}, 'config': '', 'extra_args': ['-N', 'NIV_EVENT'], 'keyring': ( diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py index 63672936c..3aedfbd86 100644 --- a/src/pybind/mgr/cephadm/utils.py +++ b/src/pybind/mgr/cephadm/utils.py @@ -31,7 +31,7 @@ RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs'] CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES # these daemon types use the ceph container image -CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs'] +CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs', 'node-proxy'] # these daemons do not use the ceph image. There are other daemons # that also don't use the ceph image, but we only care about those |