Diffstat (limited to 'src/pybind/mgr/cephadm')
 src/pybind/mgr/cephadm/agent.py                                        | 565
 src/pybind/mgr/cephadm/autotune.py                                     |   1
 src/pybind/mgr/cephadm/configchecks.py                                 |   2
 src/pybind/mgr/cephadm/inventory.py                                    | 195
 src/pybind/mgr/cephadm/module.py                                       | 189
 src/pybind/mgr/cephadm/schedule.py                                     |   2
 src/pybind/mgr/cephadm/serve.py                                        |  23
 src/pybind/mgr/cephadm/service_discovery.py                            |  17
 src/pybind/mgr/cephadm/services/cephadmservice.py                      |   2
 src/pybind/mgr/cephadm/services/ingress.py                             |   1
 src/pybind/mgr/cephadm/services/jaeger.py                              |   3
 src/pybind/mgr/cephadm/services/monitoring.py                          |  30
 src/pybind/mgr/cephadm/services/nfs.py                                 |  20
 src/pybind/mgr/cephadm/services/node_proxy.py                          | 180
 src/pybind/mgr/cephadm/services/nvmeof.py                              |   1
 src/pybind/mgr/cephadm/services/osd.py                                 |  22
 src/pybind/mgr/cephadm/ssh.py                                          |   3
 src/pybind/mgr/cephadm/ssl_cert_utils.py                               |   9
 src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2       |   4
 src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2          |   3
 src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2   |   3
 src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 |  20
 src/pybind/mgr/cephadm/tests/node_proxy_data.py                        |   3
 src/pybind/mgr/cephadm/tests/test_autotune.py                          |  11
 src/pybind/mgr/cephadm/tests/test_cephadm.py                           |  73
 src/pybind/mgr/cephadm/tests/test_configchecks.py                      |   5
 src/pybind/mgr/cephadm/tests/test_node_proxy.py                        | 312
 src/pybind/mgr/cephadm/tests/test_scheduling.py                        |  58
 src/pybind/mgr/cephadm/tests/test_service_discovery.py                 |  17
 src/pybind/mgr/cephadm/tests/test_services.py                          |  42
 src/pybind/mgr/cephadm/utils.py                                        |   2
31 files changed, 1749 insertions(+), 69 deletions(-)
diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index 93a08cb34..9e71477d4 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -16,14 +16,15 @@ import time
from orchestrator import DaemonDescriptionStatus
from orchestrator._interface import daemon_type_to_service
-from ceph.utils import datetime_now
+from ceph.utils import datetime_now, http_req
from ceph.deployment.inventory import Devices
from ceph.deployment.service_spec import ServiceSpec, PlacementSpec
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
from cephadm.ssl_cert_utils import SSLCerts
from mgr_util import test_port_allocation, PortAlreadyInUse
-from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional
+from urllib.error import HTTPError, URLError
+from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional, MutableMapping
if TYPE_CHECKING:
from cephadm.module import CephadmOrchestrator
@@ -53,11 +54,10 @@ class AgentEndpoint:
self.server_addr = self.mgr.get_mgr_ip()
def configure_routes(self) -> None:
- d = cherrypy.dispatch.RoutesDispatcher()
- d.connect(name='host-data', route='/data/',
- controller=self.host_data.POST,
- conditions=dict(method=['POST']))
- cherrypy.tree.mount(None, '/', config={'/': {'request.dispatch': d}})
+ conf = {'/': {'tools.trailing_slash.on': False}}
+
+ cherrypy.tree.mount(self.host_data, '/data', config=conf)
+ cherrypy.tree.mount(self.node_proxy_endpoint, '/node-proxy', config=conf)
def configure_tls(self, server: Server) -> None:
old_cert = self.mgr.get_store(self.KV_STORE_AGENT_ROOT_CERT)
@@ -88,10 +88,540 @@ class AgentEndpoint:
def configure(self) -> None:
self.host_data = HostData(self.mgr, self.server_port, self.server_addr)
self.configure_tls(self.host_data)
+ self.node_proxy_endpoint = NodeProxyEndpoint(self.mgr)
self.configure_routes()
self.find_free_port()
+class NodeProxyEndpoint:
+ def __init__(self, mgr: "CephadmOrchestrator"):
+ self.mgr = mgr
+ self.ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert()
+ self.ssl_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ self.ssl_ctx.check_hostname = False
+ self.ssl_ctx.verify_mode = ssl.CERT_NONE
+ # self.ssl_ctx = ssl.create_default_context()
+ # self.ssl_ctx.check_hostname = True
+ # self.ssl_ctx.verify_mode = ssl.CERT_REQUIRED
+ # self.ssl_ctx.load_verify_locations(cadata=self.ssl_root_crt)
+ self.redfish_token: str = ''
+ self.redfish_session_location: str = ''
+
+ def _cp_dispatch(self, vpath: List[str]) -> "NodeProxyEndpoint":
+ if len(vpath) > 1: # /{hostname}/<endpoint>
+ hostname = vpath.pop(0) # /<endpoint>
+ cherrypy.request.params['hostname'] = hostname
+ # /{hostname}/led/{type}/{drive} eg: /{hostname}/led/chassis or /{hostname}/led/drive/{id}
+ if vpath[0] == 'led' and len(vpath) > 1: # /led/{type}/{id}
+ _type = vpath[1]
+ cherrypy.request.params['type'] = _type
+ vpath.pop(1) # /led/{id} or # /led
+ if _type == 'drive' and len(vpath) > 1: # /led/{id}
+ _id = vpath[1]
+ vpath.pop(1) # /led
+ cherrypy.request.params['id'] = _id
+ # /<endpoint>
+ return self
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['POST'])
+ @cherrypy.tools.json_in()
+ @cherrypy.tools.json_out()
+ def oob(self) -> Dict[str, Any]:
+ """
+ Get the out-of-band management tool details for a given host.
+
+ :return: oob details.
+ :rtype: dict
+ """
+ data: Dict[str, Any] = cherrypy.request.json
+ results: Dict[str, Any] = {}
+
+ self.validate_node_proxy_data(data)
+
+ # expecting name to be "node-proxy.<hostname>"
+ hostname = data['cephx']['name'][11:]
+ results['result'] = self.mgr.node_proxy_cache.oob.get(hostname, '')
+ if not results['result']:
+ raise cherrypy.HTTPError(400, 'The provided host has no iDrac details.')
+ return results
+
+ def validate_node_proxy_data(self, data: Dict[str, Any]) -> None:
+ """
+ Validate received data.
+
+ :param data: data to validate.
+ :type data: dict
+
+ :raises cherrypy.HTTPError 400: If the data is not valid (missing fields)
+ :raises cherrypy.HTTPError 403: If the secret provided is wrong.
+ """
+ cherrypy.response.status = 200
+ try:
+ if 'cephx' not in data.keys():
+ raise cherrypy.HTTPError(400, 'The field \'cephx\' must be provided.')
+ elif 'name' not in data['cephx'].keys():
+ cherrypy.response.status = 400
+ raise cherrypy.HTTPError(400, 'The field \'name\' must be provided.')
+ # expecting name to be "node-proxy.<hostname>"
+ hostname = data['cephx']['name'][11:]
+ if 'secret' not in data['cephx'].keys():
+ raise cherrypy.HTTPError(400, 'The node-proxy keyring must be provided.')
+ elif not self.mgr.node_proxy_cache.keyrings.get(hostname, ''):
+ raise cherrypy.HTTPError(502, f'Make sure the node-proxy is running on {hostname}')
+ elif data['cephx']['secret'] != self.mgr.node_proxy_cache.keyrings[hostname]:
+ raise cherrypy.HTTPError(403, f'Got wrong keyring from agent on host {hostname}.')
+ except AttributeError:
+ raise cherrypy.HTTPError(400, 'Malformed data received.')
+
+ # TODO(guits): refactor this
+ # TODO(guits): use self.node_proxy.get_critical_from_host() ?
+ def get_nok_members(self,
+ data: Dict[str, Any]) -> List[Dict[str, str]]:
+ """
+ Retrieves members whose status is not 'ok'.
+
+ :param data: Data containing information about members.
+ :type data: dict
+
+ :return: A list containing dictionaries of members whose status is not 'ok'.
+ :rtype: List[Dict[str, str]]
+ """
+ nok_members: List[Dict[str, str]] = []
+
+ for member in data.keys():
+ _status = data[member]['status']['health'].lower()
+ if _status.lower() != 'ok':
+ state = data[member]['status']['state']
+ _member = dict(
+ member=member,
+ status=_status,
+ state=state
+ )
+ nok_members.append(_member)
+
+ return nok_members
+
+ def raise_alert(self, data: Dict[str, Any]) -> None:
+ """
+ Raises hardware alerts based on the provided patch status.
+
+ :param data: Data containing patch status information.
+ :type data: dict
+
+ This function iterates through the provided status
+ information to raise hardware alerts.
+ For each component in the provided data, it removes any
+ existing health warnings associated with it and checks
+ for non-okay members using the `get_nok_members` method.
+ If non-okay members are found, it sets a new health
+ warning for that component and generates a report detailing
+ the non-okay members' statuses.
+
+ Note: This function relies on the `get_nok_members` method to
+ identify non-okay members.
+
+ :return: None
+ :rtype: None
+ """
+
+ for component in data['patch']['status'].keys():
+ alert_name = f"HARDWARE_{component.upper()}"
+ self.mgr.remove_health_warning(alert_name)
+ nok_members = self.get_nok_members(data['patch']['status'][component])
+
+ if nok_members:
+ count = len(nok_members)
+ self.mgr.set_health_warning(
+ alert_name,
+ summary=f'{count} {component} member{"s" if count > 1 else ""} {"are" if count > 1 else "is"} not ok',
+ count=count,
+ detail=[f"{member['member']} is {member['status']}: {member['state']}" for member in nok_members],
+ )
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['POST'])
+ @cherrypy.tools.json_in()
+ @cherrypy.tools.json_out()
+ def data(self) -> None:
+ """
+ Handles incoming data via a POST request.
+
+ This function is exposed to handle POST requests and expects incoming
+ JSON data. It processes the incoming data by first validating it
+ through the `validate_node_proxy_data` method. Subsequently, it
+ extracts the hostname from the data and saves the information
+ using `mgr.node_proxy.save`. Finally, it raises alerts based on the
+ provided status through the `raise_alert` method.
+
+ :return: None
+ :rtype: None
+ """
+ data: Dict[str, Any] = cherrypy.request.json
+ self.validate_node_proxy_data(data)
+ if 'patch' not in data.keys():
+ raise cherrypy.HTTPError(400, 'Malformed data received.')
+ host = data['cephx']['name'][11:]
+ self.mgr.node_proxy_cache.save(host, data['patch'])
+ self.raise_alert(data)
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET', 'PATCH'])
+ @cherrypy.tools.json_in()
+ @cherrypy.tools.json_out()
+ def led(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles enclosure LED operations for the specified hostname.
+
+ This function handles GET and PATCH requests related to LED status for a
+ specific hostname. It identifies the request method and provided hostname.
+ If the hostname is missing, it logs an error and returns an error message.
+
+ For PATCH requests, it prepares authorization headers based on the
+ provided ID and password, encodes them, and constructs the authorization
+ header.
+
+ After processing, it queries the endpoint and returns the result.
+
+ :param kw: Keyword arguments including 'hostname'.
+ :type kw: dict
+
+ :return: Result of the LED-related operation.
+ :rtype: dict[str, Any]
+ """
+ method: str = cherrypy.request.method
+ header: MutableMapping[str, str] = {}
+ hostname: Optional[str] = kw.get('hostname')
+ led_type: Optional[str] = kw.get('type')
+ id_drive: Optional[str] = kw.get('id')
+ payload: Optional[Dict[str, str]] = None
+ endpoint: List[Any] = ['led', led_type]
+ device: str = id_drive if id_drive else ''
+
+ ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert()
+ ssl_ctx = ssl.create_default_context()
+ ssl_ctx.check_hostname = True
+ ssl_ctx.verify_mode = ssl.CERT_REQUIRED
+ ssl_ctx.load_verify_locations(cadata=ssl_root_crt)
+
+ if not hostname:
+ msg: str = "listing enclosure LED status for all nodes is not implemented."
+ self.mgr.log.debug(msg)
+ raise cherrypy.HTTPError(501, msg)
+
+ if not led_type:
+ msg = "the led type must be provided (either 'chassis' or 'drive')."
+ self.mgr.log.debug(msg)
+ raise cherrypy.HTTPError(400, msg)
+
+ if led_type == 'drive' and not id_drive:
+ msg = "the id of the drive must be provided when type is 'drive'."
+ self.mgr.log.debug(msg)
+ raise cherrypy.HTTPError(400, msg)
+
+ if led_type == 'drive':
+ endpoint.append(device)
+
+ if hostname not in self.mgr.node_proxy_cache.data.keys():
+ # TODO(guits): update unit test for this
+ msg = f"'{hostname}' not found."
+ self.mgr.log.debug(msg)
+ raise cherrypy.HTTPError(400, msg)
+
+ addr: str = self.mgr.inventory.get_addr(hostname)
+
+ if method == 'PATCH':
+ # TODO(guits): need to check the request is authorized
+ # allowing a specific keyring only ? (client.admin or client.agent.. ?)
+ data: Dict[str, Any] = cherrypy.request.json
+ if 'state' not in data.keys():
+ msg = "'state' key not provided."
+ raise cherrypy.HTTPError(400, msg)
+ if 'keyring' not in data.keys():
+ msg = "'keyring' key must be provided."
+ raise cherrypy.HTTPError(400, msg)
+ if data['keyring'] != self.mgr.node_proxy_cache.keyrings.get(hostname):
+ msg = 'wrong keyring provided.'
+ raise cherrypy.HTTPError(401, msg)
+ payload = {}
+ payload['state'] = data['state']
+
+ if led_type == 'drive':
+ if id_drive not in self.mgr.node_proxy_cache.data[hostname]['status']['storage'].keys():
+ # TODO(guits): update unit test for this
+ msg = f"'{id_drive}' not found."
+ self.mgr.log.debug(msg)
+ raise cherrypy.HTTPError(400, msg)
+
+ endpoint = f'/{"/".join(endpoint)}'
+ header = self.mgr.node_proxy.generate_auth_header(hostname)
+
+ try:
+ headers, result, status = http_req(hostname=addr,
+ port='9456',
+ headers=header,
+ method=method,
+ data=json.dumps(payload),
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ response_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.debug(e)
+ except URLError:
+ raise cherrypy.HTTPError(502, f'Make sure the node-proxy agent is deployed and running on {hostname}')
+
+ return response_json
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def fullreport(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve a full report.
+
+ This function is exposed to handle GET requests and retrieves a comprehensive
+ report using the 'fullreport' method from the NodeProxyCache class.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: The full report data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.fullreport(**kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def criticals(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve critical information.
+
+ This function is exposed to handle GET requests and fetches critical data
+ using the 'criticals' method from the NodeProxyCache class.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Critical information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.criticals(**kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def summary(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve summary information.
+
+ This function is exposed to handle GET requests and fetches summary
+ data using the 'summary' method from the NodeProxyCache class.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Summary information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.summary(**kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def memory(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve specific information.
+
+ This function is exposed to handle GET requests
+ and fetch specific data using the 'common' method
+ from the NodeProxyCache class with the 'memory' category.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Specific information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.common('memory', **kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def network(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve specific information.
+
+ This function is exposed to handle GET requests
+ and fetch specific data using the 'common' method
+ from the NodeProxyCache class with the 'network' category.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Specific information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.common('network', **kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def processors(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve specific information.
+
+ This function is exposed to handle GET requests
+ and fetch specific data using the 'common' method
+ from the NodeProxyCache class with the 'processors' category.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Specific information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.common('processors', **kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def storage(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve specific information.
+
+ This function is exposed to handle GET requests
+ and fetch specific data using the 'common' method
+ from the NodeProxyCache class with the 'storage' category.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Specific information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.common('storage', **kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def power(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve specific information.
+
+ This function is exposed to handle GET requests
+ and fetch specific data using the 'common' method
+ from the NodeProxyCache class with the 'power' category.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Specific information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.common('power', **kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def fans(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve specific information.
+
+ This function is exposed to handle GET requests
+ and fetch specific data using the 'common' method
+ from the NodeProxyCache class with the 'fans' category.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Specific information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.common('fans', **kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+ @cherrypy.expose
+ @cherrypy.tools.allow(methods=['GET'])
+ @cherrypy.tools.json_out()
+ def firmwares(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Handles GET request to retrieve firmware information.
+
+ This function is exposed to handle GET requests and fetches firmware data using
+ the 'firmwares' method from the NodeProxyCache class.
+
+ :param kw: Keyword arguments for the request.
+ :type kw: dict
+
+ :return: Firmware information data.
+ :rtype: dict[str, Any]
+
+ :raises cherrypy.HTTPError 404: If the passed hostname is not found.
+ """
+ try:
+ results = self.mgr.node_proxy_cache.firmwares(**kw)
+ except KeyError:
+ raise cherrypy.HTTPError(404, f"{kw.get('hostname')} not found.")
+ return results
+
+
class HostData(Server):
exposed = True
@@ -109,9 +639,11 @@ class HostData(Server):
self.unsubscribe()
super().stop()
+ @cherrypy.tools.allow(methods=['POST'])
@cherrypy.tools.json_in()
@cherrypy.tools.json_out()
- def POST(self) -> Dict[str, Any]:
+ @cherrypy.expose
+ def index(self) -> Dict[str, Any]:
data: Dict[str, Any] = cherrypy.request.json
results: Dict[str, Any] = {}
try:
@@ -234,6 +766,7 @@ class AgentMessageThread(threading.Thread):
self.port = port
self.data: str = json.dumps(data)
self.daemon_spec: Optional[CephadmDaemonDeploySpec] = daemon_spec
+ self.agent_response: str = ''
super().__init__(target=self.run)
def run(self) -> None:
@@ -286,8 +819,8 @@ class AgentMessageThread(threading.Thread):
secure_agent_socket.connect((self.addr, self.port))
msg = (bytes_len + self.data)
secure_agent_socket.sendall(msg.encode('utf-8'))
- agent_response = secure_agent_socket.recv(1024).decode()
- self.mgr.log.debug(f'Received "{agent_response}" from agent on host {self.host}')
+ self.agent_response = secure_agent_socket.recv(1024).decode()
+ self.mgr.log.debug(f'Received "{self.agent_response}" from agent on host {self.host}')
if self.daemon_spec:
self.mgr.agent_cache.agent_config_successfully_delivered(self.daemon_spec)
self.mgr.agent_cache.sending_agent_message[self.host] = False
@@ -307,6 +840,9 @@ class AgentMessageThread(threading.Thread):
self.mgr.agent_cache.sending_agent_message[self.host] = False
return
+ def get_agent_response(self) -> str:
+ return self.agent_response
+
class CephadmAgentHelpers:
def __init__(self, mgr: "CephadmOrchestrator"):
@@ -403,10 +939,11 @@ class CephadmAgentHelpers:
if 'agent' in self.mgr.spec_store:
self.mgr.spec_store.rm('agent')
need_apply = True
- self.mgr.agent_cache.agent_counter = {}
- self.mgr.agent_cache.agent_timestamp = {}
- self.mgr.agent_cache.agent_keys = {}
- self.mgr.agent_cache.agent_ports = {}
+ if not self.mgr.cache.get_daemons_by_service('agent'):
+ self.mgr.agent_cache.agent_counter = {}
+ self.mgr.agent_cache.agent_timestamp = {}
+ self.mgr.agent_cache.agent_keys = {}
+ self.mgr.agent_cache.agent_ports = {}
return need_apply
def _check_agent(self, host: str) -> bool:
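Note: the agent.py hunk above mounts a new NodeProxyEndpoint under /node-proxy on the cephadm agent HTTP server, alongside the existing /data route. Below is a minimal sketch of how a node-proxy agent could report hardware status to that endpoint; the mgr address, port and keyring values are placeholders (the real values are delivered in the daemon's deploy config), and the payload layout follows the validate_node_proxy_data() and data() handlers shown above.

import json
import ssl
import urllib.request

MGR_ADDR = "10.0.0.1"     # placeholder: active mgr IP
ENDPOINT_PORT = 7150      # placeholder: port chosen by AgentEndpoint.find_free_port()
KEYRING = "AQD..."        # placeholder: node-proxy keyring delivered at deploy time

payload = {
    "cephx": {"name": "node-proxy.host01", "secret": KEYRING},
    "patch": {
        "status": {"power": {"PS1": {"status": {"health": "OK", "state": "Enabled"}}}},
        "sn": "ABC123",
        "host": "host01",
        "firmwares": {},
    },
}

ctx = ssl.create_default_context()
ctx.check_hostname = False       # sketch only; a real client should pin the cephadm root cert
ctx.verify_mode = ssl.CERT_NONE

req = urllib.request.Request(
    f"https://{MGR_ADDR}:{ENDPOINT_PORT}/node-proxy/data",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req, context=ctx) as resp:
    print(resp.status)

Read-only queries follow the same routing via _cp_dispatch(): GET /node-proxy/<hostname>/summary, /fullreport or /criticals, and PATCH /node-proxy/<hostname>/led/drive/<id> with a JSON body containing 'state' and 'keyring'.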
diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py
index 51c931cba..72ebcd660 100644
--- a/src/pybind/mgr/cephadm/autotune.py
+++ b/src/pybind/mgr/cephadm/autotune.py
@@ -15,6 +15,7 @@ class MemoryAutotuner(object):
'crash': 128 * 1048576,
'keepalived': 128 * 1048576,
'haproxy': 128 * 1048576,
+ 'nvmeof': 4096 * 1048576,
}
default_size = 1024 * 1048576
diff --git a/src/pybind/mgr/cephadm/configchecks.py b/src/pybind/mgr/cephadm/configchecks.py
index b9dcb18f4..9077a1d66 100644
--- a/src/pybind/mgr/cephadm/configchecks.py
+++ b/src/pybind/mgr/cephadm/configchecks.py
@@ -674,7 +674,7 @@ class CephadmConfigChecks:
self.host_to_role[hostname] = list(self.mgr.cache.get_daemon_types(hostname))
def run_checks(self) -> None:
- checks_enabled = self.mgr.get_module_option('config_checks_enabled')
+ checks_enabled = self.mgr.config_checks_enabled
if checks_enabled is not True:
return
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index 7153ca6dc..966ffc046 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -8,7 +8,7 @@ import logging
import math
import socket
from typing import TYPE_CHECKING, Dict, List, Iterator, Optional, Any, Tuple, Set, Mapping, cast, \
- NamedTuple, Type
+ NamedTuple, Type, ValuesView
import orchestrator
from ceph.deployment import inventory
@@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
HOST_CACHE_PREFIX = "host."
SPEC_STORE_PREFIX = "spec."
AGENT_CACHE_PREFIX = 'agent.'
+NODE_PROXY_CACHE_PREFIX = 'node_proxy'
class HostCacheStatus(enum.Enum):
@@ -1405,6 +1406,196 @@ class HostCache():
return self.scheduled_daemon_actions.get(host, {}).get(daemon)
+class NodeProxyCache:
+ def __init__(self, mgr: "CephadmOrchestrator") -> None:
+ self.mgr = mgr
+ self.data: Dict[str, Any] = {}
+ self.oob: Dict[str, Any] = {}
+ self.keyrings: Dict[str, str] = {}
+
+ def load(self) -> None:
+ _oob = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/oob', '{}')
+ self.oob = json.loads(_oob)
+
+ _keyrings = self.mgr.get_store(f'{NODE_PROXY_CACHE_PREFIX}/keyrings', '{}')
+ self.keyrings = json.loads(_keyrings)
+
+ for k, v in self.mgr.get_store_prefix(f'{NODE_PROXY_CACHE_PREFIX}/data').items():
+ host = k.split('/')[-1:][0]
+
+ if host not in self.mgr.inventory.keys():
+ # remove entry for host that no longer exists
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", None)
+ try:
+ self.oob.pop(host)
+ self.data.pop(host)
+ self.keyrings.pop(host)
+ except KeyError:
+ pass
+ continue
+
+ self.data[host] = json.loads(v)
+
+ def save(self,
+ host: str = '',
+ data: Dict[str, Any] = {}) -> None:
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/data/{host}", json.dumps(data))
+
+ def update_oob(self, host: str, host_oob_info: Dict[str, str]) -> None:
+ self.oob[host] = host_oob_info
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/oob", json.dumps(self.oob))
+
+ def update_keyring(self, host: str, key: str) -> None:
+ self.keyrings[host] = key
+ self.mgr.set_store(f"{NODE_PROXY_CACHE_PREFIX}/keyrings", json.dumps(self.keyrings))
+
+ def fullreport(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Retrieves the full report for the specified hostname.
+
+ If a hostname is provided in the keyword arguments, it retrieves the full report
+ data for that specific host. If no hostname is provided, it fetches the full
+ report data for all hosts available.
+
+ :param kw: Keyword arguments including 'hostname'.
+ :type kw: dict
+
+ :return: The full report data for the specified hostname(s).
+ :rtype: dict
+ """
+ hostname = kw.get('hostname')
+ hosts = [hostname] if hostname else self.data.keys()
+ return {host: self.data[host] for host in hosts}
+
+ def summary(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Summarizes the health status of components for specified hosts or all hosts.
+
+ Generates a summary of the health status of components for given hosts. If
+ no hostname is provided, it generates the health status summary for all hosts.
+ It inspects the status of each component and categorizes it as 'ok' or 'error'
+ based on the health status of its members.
+
+ :param kw: Keyword arguments including 'hostname'.
+ :type kw: dict
+
+ :return: A dictionary containing the health status summary for each specified
+ host or all hosts and their components.
+ :rtype: Dict[str, Dict[str, str]]
+ """
+ hostname = kw.get('hostname')
+ hosts = [hostname] if hostname else self.data.keys()
+
+ def is_unknown(statuses: ValuesView) -> bool:
+ return any([status['status']['health'].lower() == 'unknown' for status in statuses]) and not is_error(statuses)
+
+ def is_error(statuses: ValuesView) -> bool:
+ return any([status['status']['health'].lower() == 'error' for status in statuses])
+
+ _result: Dict[str, Any] = {}
+
+ for host in hosts:
+ _result[host] = {}
+ _result[host]['status'] = {}
+ data = self.data[host]
+ for component in data['status'].keys():
+ values = data['status'][component].values()
+ if is_error(values):
+ state = 'error'
+ elif is_unknown(values):
+ state = 'unknown'
+ else:
+ state = 'ok'
+ _result[host]['status'][component] = state
+ _result[host]['sn'] = data['sn']
+ _result[host]['host'] = data['host']
+ _result[host]['firmwares'] = data['firmwares']
+ return _result
+
+ def common(self, endpoint: str, **kw: Any) -> Dict[str, Any]:
+ """
+ Retrieves specific endpoint information for a specific hostname or all hosts.
+
+ Retrieves information from the specified 'endpoint' for all available hosts.
+ If 'hostname' is provided, retrieves the specified 'endpoint' information for that host.
+
+ :param endpoint: The endpoint for which information is retrieved.
+ :type endpoint: str
+ :param kw: Keyword arguments, including 'hostname' if specified.
+ :type kw: dict
+
+ :return: Endpoint information for the specified host(s).
+ :rtype: Union[Dict[str, Any], Any]
+ """
+ hostname = kw.get('hostname')
+ _result = {}
+ hosts = [hostname] if hostname else self.data.keys()
+
+ for host in hosts:
+ try:
+ _result[host] = self.data[host]['status'][endpoint]
+ except KeyError:
+ raise KeyError(f'Invalid host {host} or component {endpoint}.')
+ return _result
+
+ def firmwares(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Retrieves firmware information for a specific hostname or all hosts.
+
+ If a 'hostname' is provided in the keyword arguments, retrieves firmware
+ information for that specific host. Otherwise, retrieves firmware
+ information for all available hosts.
+
+ :param kw: Keyword arguments, including 'hostname' if specified.
+ :type kw: dict
+
+ :return: A dictionary containing firmware information for each host.
+ :rtype: Dict[str, Any]
+ """
+ hostname = kw.get('hostname')
+ hosts = [hostname] if hostname else self.data.keys()
+
+ return {host: self.data[host]['firmwares'] for host in hosts}
+
+ def get_critical_from_host(self, hostname: str) -> Dict[str, Any]:
+ results: Dict[str, Any] = {}
+ for component, data_component in self.data[hostname]['status'].items():
+ if component not in results.keys():
+ results[component] = {}
+ for member, data_member in data_component.items():
+ if component == 'power':
+ data_member['status']['health'] = 'critical'
+ data_member['status']['state'] = 'unplugged'
+ if component == 'memory':
+ data_member['status']['health'] = 'critical'
+ data_member['status']['state'] = 'errors detected'
+ if data_member['status']['health'].lower() != 'ok':
+ results[component][member] = data_member
+ return results
+
+ def criticals(self, **kw: Any) -> Dict[str, Any]:
+ """
+ Retrieves critical information for a specific hostname or all hosts.
+
+ If a 'hostname' is provided in the keyword arguments, retrieves critical
+ information for that specific host. Otherwise, retrieves critical
+ information for all available hosts.
+
+ :param kw: Keyword arguments, including 'hostname' if specified.
+ :type kw: dict
+
+ :return: A dictionary containing critical information for each host.
+ :rtype: Dict[str, Any]
+ """
+ hostname = kw.get('hostname')
+ results: Dict[str, Any] = {}
+
+ hosts = [hostname] if hostname else self.data.keys()
+ for host in hosts:
+ results[host] = self.get_critical_from_host(host)
+ return results
+
+
class AgentCache():
"""
AgentCache is used for storing metadata about agent daemons that must be kept
@@ -1507,6 +1698,8 @@ class EventStore():
for e in self.events[event.kind_subject()]:
if e.message == event.message:
+ # if subject and message match, just update the timestamp
+ e.created = event.created
return
self.events[event.kind_subject()].append(event)
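Note: the NodeProxyCache added above persists per-host hardware reports under the node_proxy config-key prefix and rolls component health up in summary(). A standalone sketch of that roll-up logic, using made-up sample data:

status = {
    "power": {
        "PS1": {"status": {"health": "OK", "state": "Enabled"}},
        "PS2": {"status": {"health": "Error", "state": "Unplugged"}},
    },
    "fans": {
        "Fan1": {"status": {"health": "Unknown", "state": "Absent"}},
    },
}

def roll_up(members: dict) -> str:
    # a component is 'error' if any member reports error, 'unknown' if any member
    # is unknown (and none is in error), otherwise 'ok'
    healths = [m["status"]["health"].lower() for m in members.values()]
    if any(h == "error" for h in healths):
        return "error"
    if any(h == "unknown" for h in healths):
        return "unknown"
    return "ok"

print({component: roll_up(members) for component, members in status.items()})
# {'power': 'error', 'fans': 'unknown'}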
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 7b97ce74a..f1d234fb2 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1,4 +1,5 @@
import asyncio
+import concurrent
import json
import errno
import ipaddress
@@ -10,6 +11,7 @@ from configparser import ConfigParser
from contextlib import contextmanager
from functools import wraps
from tempfile import TemporaryDirectory, NamedTemporaryFile
+from urllib.error import HTTPError
from threading import Event
from cephadm.service_discovery import ServiceDiscovery
@@ -39,7 +41,14 @@ from cephadm.http_server import CephadmHttpServer
from cephadm.agent import CephadmAgentHelpers
-from mgr_module import MgrModule, HandleCommandResult, Option, NotifyType
+from mgr_module import (
+ MgrModule,
+ HandleCommandResult,
+ Option,
+ NotifyType,
+ MonCommandFailed,
+)
+from mgr_util import build_url
import orchestrator
from orchestrator.module import to_format, Format
@@ -64,9 +73,10 @@ from .services.osd import OSDRemovalQueue, OSDService, OSD, NotFoundError
from .services.monitoring import GrafanaService, AlertmanagerService, PrometheusService, \
NodeExporterService, SNMPGatewayService, LokiService, PromtailService
from .services.jaeger import ElasticSearchService, JaegerAgentService, JaegerCollectorService, JaegerQueryService
+from .services.node_proxy import NodeProxy
from .schedule import HostAssignment
from .inventory import Inventory, SpecStore, HostCache, AgentCache, EventStore, \
- ClientKeyringStore, ClientKeyringSpec, TunedProfileStore
+ ClientKeyringStore, ClientKeyringSpec, TunedProfileStore, NodeProxyCache
from .upgrade import CephadmUpgrade
from .template import TemplateMgr
from .utils import CEPH_IMAGE_TYPES, RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES, forall_hosts, \
@@ -107,7 +117,7 @@ os._exit = os_exit_noop # type: ignore
DEFAULT_IMAGE = 'quay.io/ceph/ceph' # DO NOT ADD TAG TO THIS
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.43.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.5.0'
-DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:0.0.2'
+DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.0.0'
DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0'
DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.25.0'
@@ -436,6 +446,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down'
),
Option(
+ 'hw_monitoring',
+ type='bool',
+ default=False,
+ desc='Deploy hw monitoring daemon on every host.'
+ ),
+ Option(
'max_osd_draining_count',
type='int',
default=10,
@@ -467,11 +483,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
),
Option(
'default_cephadm_command_timeout',
- type='secs',
+ type='int',
default=15 * 60,
desc='Default timeout applied to cephadm commands run directly on '
'the host (in seconds)'
),
+ Option(
+ 'oob_default_addr',
+ type='str',
+ default='169.254.1.1',
+ desc="Default address for RedFish API (oob management)."
+ ),
]
def __init__(self, *args: Any, **kwargs: Any):
@@ -529,6 +551,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.registry_password: Optional[str] = None
self.registry_insecure: bool = False
self.use_repo_digest = True
+ self.config_checks_enabled = False
self.default_registry = ''
self.autotune_memory_target_ratio = 0.0
self.autotune_interval = 0
@@ -545,6 +568,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.agent_refresh_rate = 0
self.agent_down_multiplier = 0.0
self.agent_starting_port = 0
+ self.hw_monitoring = False
self.service_discovery_port = 0
self.secure_monitoring_stack = False
self.apply_spec_fails: List[Tuple[str, str]] = []
@@ -554,6 +578,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.cgroups_split = True
self.log_refresh_metadata = False
self.default_cephadm_command_timeout = 0
+ self.oob_default_addr = ''
self.notify(NotifyType.mon_map, None)
self.config_notify()
@@ -584,6 +609,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.cache = HostCache(self)
self.cache.load()
+ self.node_proxy_cache = NodeProxyCache(self)
+ self.node_proxy_cache.load()
+
self.agent_cache = AgentCache(self)
self.agent_cache.load()
@@ -621,7 +649,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
PrometheusService, NodeExporterService, LokiService, PromtailService, CrashService, IscsiService,
IngressService, CustomContainerService, CephfsMirrorService, NvmeofService,
CephadmAgent, CephExporterService, SNMPGatewayService, ElasticSearchService,
- JaegerQueryService, JaegerAgentService, JaegerCollectorService
+ JaegerQueryService, JaegerAgentService, JaegerCollectorService, NodeProxy
]
# https://github.com/python/mypy/issues/8993
@@ -632,6 +660,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.osd_service: OSDService = cast(OSDService, self.cephadm_services['osd'])
self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi'])
self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof'])
+ self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy'])
self.scheduled_async_actions: List[Callable] = []
@@ -644,6 +673,9 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.http_server = CephadmHttpServer(self)
self.http_server.start()
+
+ self.node_proxy = NodeProxy(self)
+
self.agent_helpers = CephadmAgentHelpers(self)
if self.use_agent:
self.agent_helpers._apply_agent()
@@ -709,7 +741,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
# are provided, that will be included in the OrchestratorError's message
try:
yield
- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, concurrent.futures.TimeoutError):
err_str: str = ''
if cmd:
err_str = f'Command "{cmd}" timed out '
@@ -722,6 +754,16 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
else:
err_str += (f'(default {self.default_cephadm_command_timeout} second timeout)')
raise OrchestratorError(err_str)
+ except concurrent.futures.CancelledError as e:
+ err_str = ''
+ if cmd:
+ err_str = f'Command "{cmd}" failed '
+ else:
+ err_str = 'Command failed '
+ if host:
+ err_str += f'on host {host} '
+ err_str += f' - {str(e)}'
+ raise OrchestratorError(err_str)
def set_container_image(self, entity: str, image: str) -> None:
self.check_mon_command({
@@ -810,7 +852,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
Generate a unique random service name
"""
suffix = daemon_type not in [
- 'mon', 'crash', 'ceph-exporter',
+ 'mon', 'crash', 'ceph-exporter', 'node-proxy',
'prometheus', 'node-exporter', 'grafana', 'alertmanager',
'container', 'agent', 'snmp-gateway', 'loki', 'promtail',
'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query'
@@ -948,6 +990,17 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len(
failed_daemons), failed_daemons)
+ def get_first_matching_network_ip(self, host: str, sspec: ServiceSpec) -> Optional[str]:
+ sspec_networks = sspec.networks
+ for subnet, ifaces in self.cache.networks.get(host, {}).items():
+ host_network = ipaddress.ip_network(subnet)
+ for spec_network_str in sspec_networks:
+ spec_network = ipaddress.ip_network(spec_network_str)
+ if host_network.overlaps(spec_network):
+ return list(ifaces.values())[0][0]
+ logger.error(f'{spec_network} from {sspec.service_name()} spec does not overlap with {host_network} on {host}')
+ return None
+
@staticmethod
def can_run() -> Tuple[bool, str]:
if asyncssh is not None:
@@ -1320,7 +1373,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
@orchestrator._cli_read_command('cephadm config-check status')
def _config_check_status(self) -> HandleCommandResult:
"""Show whether the configuration checker feature is enabled/disabled"""
- status = self.get_module_option('config_checks_enabled')
+ status = self.config_checks_enabled
return HandleCommandResult(stdout="Enabled" if status else "Disabled")
@orchestrator._cli_write_command('cephadm config-check enable')
@@ -1597,6 +1650,18 @@ Then run the following:
if spec.hostname in self.inventory and self.inventory.get_addr(spec.hostname) != spec.addr:
self.cache.refresh_all_host_info(spec.hostname)
+ if spec.oob:
+ if not spec.oob.get('addr'):
+ spec.oob['addr'] = self.oob_default_addr
+ if not spec.oob.get('port'):
+ spec.oob['port'] = '443'
+ host_oob_info = dict()
+ host_oob_info['addr'] = spec.oob['addr']
+ host_oob_info['port'] = spec.oob['port']
+ host_oob_info['username'] = spec.oob['username']
+ host_oob_info['password'] = spec.oob['password']
+ self.node_proxy_cache.update_oob(spec.hostname, host_oob_info)
+
# prime crush map?
if spec.location:
self.check_mon_command({
@@ -1621,7 +1686,72 @@ Then run the following:
return self._add_host(spec)
@handle_orch_error
- def remove_host(self, host: str, force: bool = False, offline: bool = False) -> str:
+ def hardware_light(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]:
+ try:
+ result = self.node_proxy.led(light_type=light_type,
+ action=action,
+ hostname=hostname,
+ device=device)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"http error while querying node-proxy API: {e}")
+ return result
+
+ @handle_orch_error
+ def hardware_shutdown(self, hostname: str, force: Optional[bool] = False, yes_i_really_mean_it: bool = False) -> str:
+ if not yes_i_really_mean_it:
+ raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+ try:
+ self.node_proxy.shutdown(hostname, force)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"Can't shutdown node {hostname}: {e}")
+ return f'Shutdown scheduled on {hostname}'
+
+ @handle_orch_error
+ def hardware_powercycle(self, hostname: str, yes_i_really_mean_it: bool = False) -> str:
+ if not yes_i_really_mean_it:
+ raise OrchestratorError("you must pass --yes-i-really-mean-it")
+
+ try:
+ self.node_proxy.powercycle(hostname)
+ except RuntimeError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f'Make sure the node-proxy agent is deployed and running on {hostname}')
+ except HTTPError as e:
+ self.log.error(e)
+ raise OrchestratorValidationError(f"Can't perform powercycle on node {hostname}: {e}")
+ return f'Powercycle scheduled on {hostname}'
+
+ @handle_orch_error
+ def node_proxy_fullreport(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.fullreport(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_summary(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.summary(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_firmwares(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.firmwares(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_criticals(self, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.criticals(hostname=hostname)
+
+ @handle_orch_error
+ def node_proxy_common(self, category: str, hostname: Optional[str] = None) -> Dict[str, Any]:
+ return self.node_proxy_cache.common(category, hostname=hostname)
+
+ @handle_orch_error
+ def remove_host(self, host: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> str:
"""
Remove a host from orchestrator management.
@@ -1701,6 +1831,17 @@ Then run the following:
}
run_cmd(cmd_args)
+ if rm_crush_entry:
+ try:
+ self.check_mon_command({
+ 'prefix': 'osd crush remove',
+ 'name': host,
+ })
+ except MonCommandFailed as e:
+ self.log.error(f'Couldn\'t remove host {host} from CRUSH map: {str(e)}')
+ return (f'Cephadm failed removing host {host}\n'
+ f'Failed to remove host {host} from the CRUSH map: {str(e)}')
+
self.inventory.rm_host(host)
self.cache.rm_host(host)
self.ssh.reset_con(host)
@@ -2665,6 +2806,15 @@ Then run the following:
pass
deps = sorted([self.get_mgr_ip(), server_port, root_cert,
str(self.device_enhanced_scan)])
+ elif daemon_type == 'node-proxy':
+ root_cert = ''
+ server_port = ''
+ try:
+ server_port = str(self.http_server.agent.server_port)
+ root_cert = self.http_server.agent.ssl_certs.get_root_cert()
+ except Exception:
+ pass
+ deps = sorted([self.get_mgr_ip(), server_port, root_cert])
elif daemon_type == 'iscsi':
if spec:
iscsi_spec = cast(IscsiServiceSpec, spec)
@@ -2704,6 +2854,12 @@ Then run the following:
deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
elif daemon_type == 'promtail':
deps += get_daemon_names(['loki'])
+ elif daemon_type == JaegerAgentService.TYPE:
+ for dd in self.cache.get_daemons_by_type(JaegerCollectorService.TYPE):
+ assert dd.hostname is not None
+ port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT
+ deps.append(build_url(host=dd.hostname, port=port).lstrip('/'))
+ deps = sorted(deps)
else:
# TODO(redo): some error message!
pass
@@ -3347,8 +3503,7 @@ Then run the following:
"""
for osd_id in osd_ids:
try:
- self.to_remove_osds.rm(OSD(osd_id=int(osd_id),
- remove_util=self.to_remove_osds.rm_util))
+ self.to_remove_osds.rm_by_osd_id(int(osd_id))
except (NotFoundError, KeyError, ValueError):
return f'Unable to find OSD in the queue: {osd_id}'
@@ -3382,6 +3537,18 @@ Then run the following:
f"It is recommended to add the {SpecialHostLabels.ADMIN} label to another host"
" before completing this operation.\nIf you're certain this is"
" what you want rerun this command with --force.")
+ # if the user has specified the host we are going to drain
+ # explicitly in any service spec, warn the user. Having a
+ # drained host listed in a placement provides no value, so
+ # they may want to fix it.
+ services_matching_drain_host: List[str] = []
+ for sname, sspec in self.spec_store.all_specs.items():
+ if sspec.placement.hosts and hostname in [h.hostname for h in sspec.placement.hosts]:
+ services_matching_drain_host.append(sname)
+ if services_matching_drain_host:
+ raise OrchestratorValidationError(f'Host {hostname} was found explicitly listed in the placements '
+ f'of services:\n {services_matching_drain_host}.\nPlease update those '
+ 'specs to not list this host.\nThis warning can be bypassed with --force')
self.add_host_label(hostname, '_no_schedule')
if not keep_conf_keyring:
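Note: among the module.py changes, get_first_matching_network_ip() picks the first host IP whose subnet overlaps a network listed in the service spec (used further down by the monitoring services for only_bind_port_on_networks). A self-contained sketch of the same overlap check, with illustrative data:

import ipaddress
from typing import Dict, List, Optional

# mimics self.cache.networks[host]: subnet -> {interface: [ips]}; values are illustrative
host_networks: Dict[str, Dict[str, List[str]]] = {
    "10.1.0.0/24": {"eth0": ["10.1.0.5"]},
    "192.168.50.0/24": {"eth1": ["192.168.50.7"]},
}
spec_networks = ["192.168.0.0/16"]  # networks listed in the service spec

def first_matching_ip() -> Optional[str]:
    for subnet, ifaces in host_networks.items():
        host_net = ipaddress.ip_network(subnet)
        for spec_net in spec_networks:
            if host_net.overlaps(ipaddress.ip_network(spec_net)):
                return list(ifaces.values())[0][0]
    return None

print(first_matching_ip())  # 192.168.50.7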
diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py
index 6666d761e..98d2fe998 100644
--- a/src/pybind/mgr/cephadm/schedule.py
+++ b/src/pybind/mgr/cephadm/schedule.py
@@ -413,6 +413,8 @@ class HostAssignment(object):
hostname=x.hostname, ports=self.ports_start)
for x in self.hosts_by_label(self.spec.placement.label)
]
+ if self.spec.placement.host_pattern:
+ ls = [h for h in ls if h.hostname in self.spec.placement.filter_matching_hostspecs(self.hosts)]
elif self.spec.placement.host_pattern:
ls = [
DaemonPlacement(daemon_type=self.primary_daemon_type,
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 5dfdc27a3..7bfb3f633 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -67,7 +67,6 @@ class CephadmServe:
of cephadm. This loop will then attempt to apply this new state.
"""
self.log.debug("serve starting")
- self.mgr.config_checker.load_network_config()
while self.mgr.run:
self.log.debug("serve loop start")
@@ -113,9 +112,15 @@ class CephadmServe:
if self.mgr.agent_helpers._handle_use_agent_setting():
continue
+ if self.mgr.node_proxy_service.handle_hw_monitoring_setting():
+ continue
+
if self.mgr.upgrade.continue_upgrade():
continue
+ # refresh node-proxy cache
+ self.mgr.node_proxy_cache.load()
+
except OrchestratorError as e:
if e.event_subject:
self.mgr.events.from_orch_error(e)
@@ -316,7 +321,9 @@ class CephadmServe:
self.mgr.agent_helpers._update_agent_down_healthcheck(agents_down)
self.mgr.http_server.config_update()
- self.mgr.config_checker.run_checks()
+ if self.mgr.config_checks_enabled:
+ self.mgr.config_checker.load_network_config()
+ self.mgr.config_checker.run_checks()
for k in [
'CEPHADM_HOST_CHECK_FAILED',
@@ -882,6 +889,13 @@ class CephadmServe:
hosts_altered.add(d.hostname)
break
+ # do not attempt to deploy node-proxy agent when oob details are not provided.
+ if slot.daemon_type == 'node-proxy' and slot.hostname not in self.mgr.node_proxy_cache.oob.keys():
+ self.log.debug(
+ f'Not deploying node-proxy agent on {slot.hostname} as oob details are not present.'
+ )
+ continue
+
# deploy new daemon
daemon_id = slot.name
@@ -1060,6 +1074,11 @@ class CephadmServe:
diff = list(set(last_deps) - set(deps))
if any('secure_monitoring_stack' in e for e in diff):
action = 'redeploy'
+ elif dd.daemon_type == 'jaeger-agent':
+ # changes to jaeger-agent deps affect the way the unit.run for
+ # the daemon is written, which we rewrite on redeploy, but not
+ # on reconfig.
+ action = 'redeploy'
elif spec is not None and hasattr(spec, 'extra_container_args') and dd.extra_container_args != spec.extra_container_args:
self.log.debug(
diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py
index ddc0574e2..b3b7b5499 100644
--- a/src/pybind/mgr/cephadm/service_discovery.py
+++ b/src/pybind/mgr/cephadm/service_discovery.py
@@ -19,6 +19,7 @@ import secrets
from cephadm.services.ingress import IngressSpec
from cephadm.ssl_cert_utils import SSLCerts
from cephadm.services.cephadmservice import CephExporterService
+from cephadm.services.nvmeof import NvmeofService
if TYPE_CHECKING:
from cephadm.module import CephadmOrchestrator
@@ -145,6 +146,7 @@ class Root(Server):
<p><a href='prometheus/sd-config?service=node-exporter'>Node exporter http sd-config</a></p>
<p><a href='prometheus/sd-config?service=haproxy'>HAProxy http sd-config</a></p>
<p><a href='prometheus/sd-config?service=ceph-exporter'>Ceph exporter http sd-config</a></p>
+<p><a href='prometheus/sd-config?service=nvmeof'>NVMeoF http sd-config</a></p>
<p><a href='prometheus/rules'>Prometheus rules</a></p>
</body>
</html>'''
@@ -163,6 +165,8 @@ class Root(Server):
return self.haproxy_sd_config()
elif service == 'ceph-exporter':
return self.ceph_exporter_sd_config()
+ elif service == 'nvmeof':
+ return self.nvmeof_sd_config()
else:
return []
@@ -231,6 +235,19 @@ class Root(Server):
})
return srv_entries
+ def nvmeof_sd_config(self) -> List[Dict[str, Collection[str]]]:
+ """Return <http_sd_config> compatible prometheus config for nvmeof service."""
+ srv_entries = []
+ for dd in self.mgr.cache.get_daemons_by_type('nvmeof'):
+ assert dd.hostname is not None
+ addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
+ port = NvmeofService.PROMETHEUS_PORT
+ srv_entries.append({
+ 'targets': [build_url(host=addr, port=port).lstrip('/')],
+ 'labels': {'instance': dd.hostname}
+ })
+ return srv_entries
+
@cherrypy.expose(alias='prometheus/rules')
def get_prometheus_rules(self) -> str:
"""Return currently configured prometheus rules as Yaml."""
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index 7d7a04dad..f3c45b164 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -41,7 +41,7 @@ def get_auth_entity(daemon_type: str, daemon_id: str, host: str = "") -> AuthEnt
# the CephService class refers to service types, not daemon types
if daemon_type in ['rgw', 'rbd-mirror', 'cephfs-mirror', 'nfs', "iscsi", 'nvmeof', 'ingress', 'ceph-exporter']:
return AuthEntity(f'client.{daemon_type}.{daemon_id}')
- elif daemon_type in ['crash', 'agent']:
+ elif daemon_type in ['crash', 'agent', 'node-proxy']:
if host == "":
raise OrchestratorError(
f'Host not provided to generate <{daemon_type}> auth entity name')
diff --git a/src/pybind/mgr/cephadm/services/ingress.py b/src/pybind/mgr/cephadm/services/ingress.py
index 55be30454..5edd2517d 100644
--- a/src/pybind/mgr/cephadm/services/ingress.py
+++ b/src/pybind/mgr/cephadm/services/ingress.py
@@ -187,6 +187,7 @@ class IngressService(CephService):
'monitor_port': daemon_spec.ports[1] if daemon_spec.ports else spec.monitor_port,
'local_host_ip': host_ip,
'default_server_opts': server_opts,
+ 'health_check_interval': spec.health_check_interval or '2s',
}
)
config_files = {
diff --git a/src/pybind/mgr/cephadm/services/jaeger.py b/src/pybind/mgr/cephadm/services/jaeger.py
index c136d20e6..c83c765d0 100644
--- a/src/pybind/mgr/cephadm/services/jaeger.py
+++ b/src/pybind/mgr/cephadm/services/jaeger.py
@@ -20,13 +20,16 @@ class JaegerAgentService(CephadmService):
def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
assert self.TYPE == daemon_spec.daemon_type
collectors = []
+ deps: List[str] = []
for dd in self.mgr.cache.get_daemons_by_type(JaegerCollectorService.TYPE):
# scrape jaeger-collector nodes
assert dd.hostname is not None
port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT
url = build_url(host=dd.hostname, port=port).lstrip('/')
collectors.append(url)
+ deps.append(url)
daemon_spec.final_config = {'collector_nodes': ",".join(collectors)}
+ daemon_spec.deps = sorted(deps)
return daemon_spec
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index 114c84860..10ddcbbd0 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -67,13 +67,24 @@ class GrafanaService(CephadmService):
spec: GrafanaSpec = cast(
GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name])
+
+ grafana_port = daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT
+ grafana_ip = daemon_spec.ip if daemon_spec.ip else ''
+
+ if spec.only_bind_port_on_networks and spec.networks:
+ assert daemon_spec.host is not None
+ ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec)
+ if ip_to_bind_to:
+ daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to}
+ grafana_ip = ip_to_bind_to
+
grafana_ini = self.mgr.template.render(
'services/grafana/grafana.ini.j2', {
'anonymous_access': spec.anonymous_access,
'initial_admin_password': spec.initial_admin_password,
- 'http_port': daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT,
+ 'http_port': grafana_port,
'protocol': spec.protocol,
- 'http_addr': daemon_spec.ip if daemon_spec.ip else ''
+ 'http_addr': grafana_ip
})
if 'dashboard' in self.mgr.get('mgr_map')['modules'] and spec.initial_admin_password:
@@ -402,6 +413,7 @@ class PrometheusService(CephadmService):
haproxy_sd_url = f'{srv_end_point}service=haproxy' if haproxy_cnt > 0 else None
mgr_prometheus_sd_url = f'{srv_end_point}service=mgr-prometheus' # always included
ceph_exporter_sd_url = f'{srv_end_point}service=ceph-exporter' # always included
+ nvmeof_sd_url = f'{srv_end_point}service=nvmeof' # always included
alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
@@ -417,9 +429,17 @@ class PrometheusService(CephadmService):
'node_exporter_sd_url': node_exporter_sd_url,
'alertmanager_sd_url': alertmanager_sd_url,
'haproxy_sd_url': haproxy_sd_url,
- 'ceph_exporter_sd_url': ceph_exporter_sd_url
+ 'ceph_exporter_sd_url': ceph_exporter_sd_url,
+ 'nvmeof_sd_url': nvmeof_sd_url,
}
+ ip_to_bind_to = ''
+ if spec.only_bind_port_on_networks and spec.networks:
+ assert daemon_spec.host is not None
+ ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) or ''
+ if ip_to_bind_to:
+ daemon_spec.port_ips = {str(port): ip_to_bind_to}
+
web_context = {
'prometheus_web_user': prometheus_user,
'prometheus_web_password': password_hash(prometheus_password),
@@ -446,6 +466,7 @@ class PrometheusService(CephadmService):
},
'retention_time': retention_time,
'retention_size': retention_size,
+ 'ip_to_bind_to': ip_to_bind_to,
'web_config': '/etc/prometheus/web.yml'
}
else:
@@ -454,7 +475,8 @@ class PrometheusService(CephadmService):
'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context)
},
'retention_time': retention_time,
- 'retention_size': retention_size
+ 'retention_size': retention_size,
+ 'ip_to_bind_to': ip_to_bind_to
}
# include alerts, if present in the container
diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py
index f94a00f5b..e0c61b117 100644
--- a/src/pybind/mgr/cephadm/services/nfs.py
+++ b/src/pybind/mgr/cephadm/services/nfs.py
@@ -5,6 +5,8 @@ import os
import subprocess
import tempfile
from typing import Dict, Tuple, Any, List, cast, Optional
+from configparser import ConfigParser
+from io import StringIO
from mgr_module import HandleCommandResult
from mgr_module import NFS_POOL_NAME as POOL_NAME
@@ -79,6 +81,8 @@ class NFSService(CephService):
nodeid = f'{daemon_spec.service_name}.{daemon_spec.rank}'
+ nfs_idmap_conf = '/etc/ganesha/idmap.conf'
+
# create the RADOS recovery pool keyring
rados_user = f'{daemon_type}.{daemon_id}'
rados_keyring = self.create_keyring(daemon_spec)
@@ -115,12 +119,27 @@ class NFSService(CephService):
"port": daemon_spec.ports[0] if daemon_spec.ports else 2049,
"bind_addr": bind_addr,
"haproxy_hosts": [],
+ "nfs_idmap_conf": nfs_idmap_conf,
}
if spec.enable_haproxy_protocol:
context["haproxy_hosts"] = self._haproxy_hosts()
logger.debug("selected haproxy_hosts: %r", context["haproxy_hosts"])
return self.mgr.template.render('services/nfs/ganesha.conf.j2', context)
+ # generate the idmap config
+ def get_idmap_conf() -> str:
+ idmap_conf = spec.idmap_conf
+ output = ''
+ if idmap_conf is not None:
+ cp = ConfigParser()
+ out = StringIO()
+ cp.read_dict(idmap_conf)
+ cp.write(out)
+ out.seek(0)
+ output = out.read()
+ out.close()
+ return output
+
# generate the cephadm config json
def get_cephadm_config() -> Dict[str, Any]:
config: Dict[str, Any] = {}
@@ -130,6 +149,7 @@ class NFSService(CephService):
config['extra_args'] = ['-N', 'NIV_EVENT']
config['files'] = {
'ganesha.conf': get_ganesha_conf(),
+ 'idmap.conf': get_idmap_conf()
}
config.update(
self.get_config_and_keyring(
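For illustration only (not part of the patch): get_idmap_conf() above simply serialises the spec's idmap_conf dictionary into INI text with ConfigParser, which is then shipped to the daemon as idmap.conf. A standalone sketch with a hypothetical section and key (note that ConfigParser lower-cases option names by default):

    from configparser import ConfigParser
    from io import StringIO

    idmap_conf = {'General': {'Domain': 'example.com'}}  # hypothetical spec.idmap_conf value

    cp = ConfigParser()
    cp.read_dict(idmap_conf)
    out = StringIO()
    cp.write(out)
    print(out.getvalue())
    # [General]
    # domain = example.com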
diff --git a/src/pybind/mgr/cephadm/services/node_proxy.py b/src/pybind/mgr/cephadm/services/node_proxy.py
new file mode 100644
index 000000000..e5608ca42
--- /dev/null
+++ b/src/pybind/mgr/cephadm/services/node_proxy.py
@@ -0,0 +1,180 @@
+import json
+import ssl
+import base64
+
+from urllib.error import HTTPError, URLError
+from typing import List, Any, Dict, Tuple, Optional, MutableMapping
+
+from .cephadmservice import CephadmDaemonDeploySpec, CephService
+from ceph.deployment.service_spec import ServiceSpec, PlacementSpec
+from ceph.utils import http_req
+from orchestrator import OrchestratorError
+
+
+class NodeProxy(CephService):
+ TYPE = 'node-proxy'
+
+ def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
+ assert self.TYPE == daemon_spec.daemon_type
+ daemon_id, host = daemon_spec.daemon_id, daemon_spec.host
+
+ if not self.mgr.http_server.agent:
+ raise OrchestratorError('Cannot deploy node-proxy before creating cephadm endpoint')
+
+ keyring = self.get_keyring_with_caps(self.get_auth_entity(daemon_id, host=host), [])
+ daemon_spec.keyring = keyring
+ self.mgr.node_proxy_cache.update_keyring(host, keyring)
+
+ daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
+
+ return daemon_spec
+
+ def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
+ # node-proxy is re-using the agent endpoint and therefore
+ # needs similar checks to see if the endpoint is ready.
+ self.agent_endpoint = self.mgr.http_server.agent
+ try:
+ assert self.agent_endpoint
+ assert self.agent_endpoint.ssl_certs.get_root_cert()
+ assert self.agent_endpoint.server_port
+ except Exception:
+ raise OrchestratorError(
+ 'Cannot deploy node-proxy daemons until cephadm endpoint has finished generating certs')
+
+ listener_cert, listener_key = self.agent_endpoint.ssl_certs.generate_cert(daemon_spec.host, self.mgr.inventory.get_addr(daemon_spec.host))
+ cfg = {
+ 'target_ip': self.mgr.get_mgr_ip(),
+ 'target_port': self.agent_endpoint.server_port,
+ 'name': f'node-proxy.{daemon_spec.host}',
+ 'keyring': daemon_spec.keyring,
+ 'root_cert.pem': self.agent_endpoint.ssl_certs.get_root_cert(),
+ 'listener.crt': listener_cert,
+ 'listener.key': listener_key,
+ }
+ config = {'node-proxy.json': json.dumps(cfg)}
+
+ return config, sorted([str(self.mgr.get_mgr_ip()), str(self.agent_endpoint.server_port),
+ self.agent_endpoint.ssl_certs.get_root_cert()])
+
+ def handle_hw_monitoring_setting(self) -> bool:
+ # function to apply or remove node-proxy service spec depending
+        # on whether the hw_monitoring config option is set or not.
+ # It should return True when it either creates or deletes a spec
+ # and False otherwise.
+ if self.mgr.hw_monitoring:
+ if 'node-proxy' not in self.mgr.spec_store:
+ spec = ServiceSpec(
+ service_type='node-proxy',
+ placement=PlacementSpec(host_pattern='*')
+ )
+ self.mgr.spec_store.save(spec)
+ return True
+ return False
+ else:
+ if 'node-proxy' in self.mgr.spec_store:
+ self.mgr.spec_store.rm('node-proxy')
+ return True
+ return False
+
+ def get_ssl_ctx(self) -> ssl.SSLContext:
+ ssl_root_crt = self.mgr.http_server.agent.ssl_certs.get_root_cert()
+ ssl_ctx = ssl.create_default_context()
+ ssl_ctx.check_hostname = True
+ ssl_ctx.verify_mode = ssl.CERT_REQUIRED
+ ssl_ctx.load_verify_locations(cadata=ssl_root_crt)
+ return ssl_ctx
+
+ def led(self, light_type: str, action: str, hostname: str, device: Optional[str] = None) -> Dict[str, Any]:
+ ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+ header: MutableMapping[str, str] = {}
+ method: str = 'PATCH' if action in ['on', 'off'] else 'GET'
+ payload: Optional[Dict[str, str]] = None
+ addr: str = self.mgr.inventory.get_addr(hostname)
+ endpoint: List[str] = ['led', light_type]
+ _device: str = device if device else ''
+
+ if light_type == 'drive':
+ endpoint.append(_device)
+
+ if method == 'PATCH':
+ payload = dict(state=action)
+
+ header = self.generate_auth_header(hostname)
+
+ endpoint = f'/{"/".join(endpoint)}'
+
+ try:
+ headers, result, status = http_req(hostname=addr,
+ port='9456',
+ headers=header,
+ method=method,
+ data=json.dumps(payload),
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ result_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.error(e)
+ raise
+ except URLError as e:
+ raise RuntimeError(e)
+
+ return result_json
+
+ def generate_auth_header(self, hostname: str) -> Dict[str, str]:
+ try:
+ username = self.mgr.node_proxy_cache.oob[hostname]['username']
+ password = self.mgr.node_proxy_cache.oob[hostname]['password']
+ auth: bytes = f'{username}:{password}'.encode('utf-8')
+ auth_str: str = base64.b64encode(auth).decode('utf-8')
+ header = {'Authorization': f'Basic {auth_str}'}
+ except KeyError as e:
+ self.mgr.log.error(f'Check oob information is provided for {hostname}.')
+ raise RuntimeError(e)
+ return header
+
+ def shutdown(self, hostname: str, force: Optional[bool] = False) -> Dict[str, Any]:
+ ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+ header: Dict[str, str] = self.generate_auth_header(hostname)
+ addr: str = self.mgr.inventory.get_addr(hostname)
+
+ endpoint = '/shutdown'
+ payload: Dict[str, Optional[bool]] = dict(force=force)
+
+ try:
+ headers, result, status = http_req(hostname=addr,
+ port='9456',
+ headers=header,
+ data=json.dumps(payload),
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ result_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.error(e)
+ raise
+ except URLError as e:
+ raise RuntimeError(e)
+
+ return result_json
+
+ def powercycle(self, hostname: str) -> Dict[str, Any]:
+ ssl_ctx: ssl.SSLContext = self.get_ssl_ctx()
+ header: Dict[str, str] = self.generate_auth_header(hostname)
+ addr: str = self.mgr.inventory.get_addr(hostname)
+
+ endpoint = '/powercycle'
+
+ try:
+ headers, result, status = http_req(hostname=addr,
+ port='9456',
+ headers=header,
+ data="{}",
+ endpoint=endpoint,
+ ssl_ctx=ssl_ctx)
+ result_json = json.loads(result)
+ except HTTPError as e:
+ self.mgr.log.error(e)
+ raise
+ except URLError as e:
+ raise RuntimeError(e)
+
+ return result_json
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index 7d2dd16cf..5f28273d4 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -15,6 +15,7 @@ logger = logging.getLogger(__name__)
class NvmeofService(CephService):
TYPE = 'nvmeof'
+ PROMETHEUS_PORT = 10008
def config(self, spec: NvmeofServiceSpec) -> None: # type: ignore
assert self.TYPE == spec.service_type
diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py
index bfecc5723..75b3fc58c 100644
--- a/src/pybind/mgr/cephadm/services/osd.py
+++ b/src/pybind/mgr/cephadm/services/osd.py
@@ -319,11 +319,16 @@ class OSDService(CephService):
logger.exception('Cannot decode JSON: \'%s\'' % ' '.join(out))
concat_out = {}
notes = []
- if osdspec.data_devices is not None and osdspec.data_devices.limit and len(concat_out) < osdspec.data_devices.limit:
+ if (
+ osdspec.data_devices is not None
+ and osdspec.data_devices.limit
+ and (len(concat_out) + ds.existing_daemons) < osdspec.data_devices.limit
+ ):
found = len(concat_out)
limit = osdspec.data_devices.limit
notes.append(
- f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit (Found: {found} | Limit: {limit})')
+ f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit\n'
+ f'(New Devices: {found} | Existing Matching Daemons: {ds.existing_daemons} | Limit: {limit})')
ret_all.append({'data': concat_out,
'osdspec': osdspec.service_id,
'host': host,
@@ -664,6 +669,7 @@ class OSD:
return None
self.started = True
self.stopped = False
+ self.original_weight = self.rm_util.get_weight(self)
def start_draining(self) -> bool:
if self.stopped:
@@ -672,7 +678,6 @@ class OSD:
if self.replace:
self.rm_util.set_osd_flag([self], 'out')
else:
- self.original_weight = self.rm_util.get_weight(self)
self.rm_util.reweight_osd(self, 0.0)
self.drain_started_at = datetime.utcnow()
self.draining = True
@@ -761,6 +766,7 @@ class OSD:
out['force'] = self.force
out['zap'] = self.zap
out['hostname'] = self.hostname # type: ignore
+ out['original_weight'] = self.original_weight
for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']:
if getattr(self, k):
@@ -953,6 +959,16 @@ class OSDRemovalQueue(object):
self.osds.add(osd)
osd.start()
+ def rm_by_osd_id(self, osd_id: int) -> None:
+ osd: Optional["OSD"] = None
+ for o in self.osds:
+ if o.osd_id == osd_id:
+ osd = o
+ if not osd:
+ logger.debug(f"Could not find osd with id {osd_id} in queue.")
+ raise KeyError(f'No osd with id {osd_id} in removal queue')
+ self.rm(osd)
+
def rm(self, osd: "OSD") -> None:
if not osd.exists:
raise NotFoundError()
diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py
index d17cc0fcc..7460fc159 100644
--- a/src/pybind/mgr/cephadm/ssh.py
+++ b/src/pybind/mgr/cephadm/ssh.py
@@ -1,6 +1,7 @@
import logging
import os
import asyncio
+import concurrent
from tempfile import NamedTemporaryFile
from threading import Thread
from contextlib import contextmanager
@@ -61,7 +62,7 @@ class EventLoopThread(Thread):
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
try:
return future.result(timeout)
- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, concurrent.futures.TimeoutError):
# try to cancel the task before raising the exception further up
future.cancel()
raise
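For illustration only (not part of the patch): the extra handler matters because Future.result(timeout) on the future returned by asyncio.run_coroutine_threadsafe raises concurrent.futures.TimeoutError, which on Python 3.8 through 3.10 is a different class from asyncio.TimeoutError (both only become aliases of the built-in TimeoutError in 3.11). A minimal sketch of that behaviour:

    import asyncio
    import concurrent.futures
    import threading

    loop = asyncio.new_event_loop()
    threading.Thread(target=loop.run_forever, daemon=True).start()

    async def slow() -> None:
        await asyncio.sleep(10)

    future = asyncio.run_coroutine_threadsafe(slow(), loop)
    try:
        future.result(timeout=0.1)  # raises concurrent.futures.TimeoutError
    except concurrent.futures.TimeoutError:
        future.cancel()  # cancel the task before propagating, as the ssh helper does
        print('timed out waiting for the coroutine')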
diff --git a/src/pybind/mgr/cephadm/ssl_cert_utils.py b/src/pybind/mgr/cephadm/ssl_cert_utils.py
index fcc6f00ea..6295152c7 100644
--- a/src/pybind/mgr/cephadm/ssl_cert_utils.py
+++ b/src/pybind/mgr/cephadm/ssl_cert_utils.py
@@ -46,7 +46,7 @@ class SSLCerts:
root_builder = root_builder.public_key(root_public_key)
root_builder = root_builder.add_extension(
x509.SubjectAlternativeName(
- [x509.IPAddress(ipaddress.IPv4Address(addr))]
+ [x509.IPAddress(ipaddress.ip_address(addr))]
),
critical=False
)
@@ -70,12 +70,9 @@ class SSLCerts:
def generate_cert(self, host: str, addr: str) -> Tuple[str, str]:
have_ip = True
try:
- ip = x509.IPAddress(ipaddress.IPv4Address(addr))
+ ip = x509.IPAddress(ipaddress.ip_address(addr))
except Exception:
- try:
- ip = x509.IPAddress(ipaddress.IPv6Address(addr))
- except Exception:
- have_ip = False
+ have_ip = False
private_key = rsa.generate_private_key(
public_exponent=65537, key_size=4096, backend=default_backend())
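For illustration only (not part of the patch): ipaddress.ip_address() accepts both IPv4 and IPv6 literals and raises ValueError otherwise, which is why the nested IPv4/IPv6 fallback could be collapsed into a single call:

    import ipaddress

    print(repr(ipaddress.ip_address('192.168.122.1')))  # IPv4Address('192.168.122.1')
    print(repr(ipaddress.ip_address('2001:db8::1')))    # IPv6Address('2001:db8::1')
    try:
        ipaddress.ip_address('not-an-ip')
    except ValueError:
        print('not an IP literal; generate_cert() takes the hostname-only SAN path')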
diff --git a/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2 b/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2
index 100acce40..9a0309ab4 100644
--- a/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2
+++ b/src/pybind/mgr/cephadm/templates/services/ingress/haproxy.cfg.j2
@@ -74,7 +74,7 @@ backend backend
balance static-rr
option httpchk HEAD / HTTP/1.0
{% for server in servers %}
- server {{ server.name }} {{ server.ip }}:{{ server.port }} check weight 100
+ server {{ server.name }} {{ server.ip }}:{{ server.port }} check weight 100 inter {{ health_check_interval }}
{% endfor %}
{% endif %}
{% if mode == 'tcp' %}
@@ -85,6 +85,6 @@ backend backend
default-server {{ default_server_opts|join(" ") }}
{% endif %}
{% for server in servers %}
- server {{ server.name }} {{ server.ip }}:{{ server.port }}
+ server {{ server.name }} {{ server.ip }}:{{ server.port }} check
{% endfor %}
{% endif %}
diff --git a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
index ab8df7192..7bc0278d7 100644
--- a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
@@ -16,6 +16,9 @@ NFSv4 {
Delegations = false;
RecoveryBackend = 'rados_cluster';
Minor_Versions = 1, 2;
+{% if nfs_idmap_conf %}
+ IdmapConf = "{{ nfs_idmap_conf }}";
+{% endif %}
}
RADOS_KV {
diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
index 69b8332cd..17290f504 100644
--- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
@@ -7,6 +7,9 @@ port = {{ port }}
enable_auth = {{ spec.enable_auth }}
state_update_notify = True
state_update_interval_sec = 5
+enable_prometheus_exporter = True
+prometheus_exporter_ssl = False
+prometheus_port = 10008
[ceph]
pool = {{ spec.pool }}
diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
index b56843994..57d2f8a3f 100644
--- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
+++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
@@ -107,3 +107,23 @@ scrape_configs:
- url: {{ ceph_exporter_sd_url }}
{% endif %}
{% endif %}
+
+{% if nvmeof_sd_url %}
+ - job_name: 'nvmeof'
+{% if secure_monitoring_stack %}
+ honor_labels: true
+ scheme: https
+ tls_config:
+ ca_file: root_cert.pem
+ http_sd_configs:
+ - url: {{ nvmeof_sd_url }}
+ basic_auth:
+ username: {{ service_discovery_username }}
+ password: {{ service_discovery_password }}
+ tls_config:
+ ca_file: root_cert.pem
+{% else %}
+ http_sd_configs:
+ - url: {{ nvmeof_sd_url }}
+{% endif %}
+{% endif %}
diff --git a/src/pybind/mgr/cephadm/tests/node_proxy_data.py b/src/pybind/mgr/cephadm/tests/node_proxy_data.py
new file mode 100644
index 000000000..37e6aaa46
--- /dev/null
+++ b/src/pybind/mgr/cephadm/tests/node_proxy_data.py
@@ -0,0 +1,3 @@
+full_set_with_critical = {'host': 'host01', 'sn': '12345', 'status': {'storage': {'disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1': {'description': 'Solid State Disk 0:1:0', 'entity': 'RAID.Integrated.1-1', 'capacity_bytes': 959656755200, 'model': 'KPM5XVUG960G', 'protocol': 'SAS', 'serial_number': '8080A1CRTP5F', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 0, 'locationtype': 'Slot'}}}, 'disk.bay.9:enclosure.internal.0-1': {'description': 'PCIe SSD in Slot 9 in Bay 1', 'entity': 'CPU.1', 'capacity_bytes': 1600321314816, 'model': 'Dell Express Flash NVMe P4610 1.6TB SFF', 'protocol': 'PCIe', 'serial_number': 'PHLN035305MN1P6AGN', 'status': {'health': 'Critical', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 9, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 20, 'total_threads': 40, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'nic.slot.1-1-1': {'description': 'NIC in Slot 1 Port 1 Partition 1', 'name': 'System Ethernet Interface', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'StandbyOffline'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 31237, 'status': {'health': 'Critical', 'state': 'Enabled'}}}}, 'firmwares': {}}
+mgr_inventory_cache = {'host01': {'hostname': 'host01', 'addr': '10.10.10.11', 'labels': ['_admin'], 'status': '', 'oob': {'hostname': '10.10.10.11', 'username': 'root', 'password': 'ceph123'}}, 'host02': {'hostname': 'host02', 'addr': '10.10.10.12', 'labels': [], 'status': '', 'oob': {'hostname': '10.10.10.12', 'username': 'root', 'password': 'ceph123'}}}
+full_set = {'host01': {'host': 'host01', 'sn': 'FR8Y5X3', 'status': {'storage': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'power': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'fans': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'host02': {'host': 'host02', 'sn': 'FR8Y5X4', 'status': {'storage': {'disk.bay.8:enclosure.internal.0-1:nonraid.slot.2-1': {'description': 'Disk 8 in Backplane 1 of Storage Controller in Slot 2', 'entity': 'NonRAID.Slot.2-1', 'capacity_bytes': 20000588955136, 'model': 'ST20000NM008D-3D', 'protocol': 'SATA', 'serial_number': 'ZVT99QLL', 'status': {'health': 'OK', 'healthrollup': 'OK', 'state': 'Enabled'}, 'physical_location': {'partlocation': {'locationordinalvalue': 8, 'locationtype': 'Slot'}}}}, 'processors': {'cpu.socket.2': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}, 'cpu.socket.1': {'description': 'Represents the properties of a Processor attached to this System', 'total_cores': 16, 'total_threads': 32, 'processor_type': 'CPU', 'model': 'Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz', 'status': {'health': 'OK', 'state': 'Enabled'}, 'manufacturer': 'Intel'}}, 'network': {'oslogicalnetwork.2': {'description': 'eno8303', 'name': 'eno8303', 'speed_mbps': 0, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'memory': {'dimm.socket.a1': {'description': 'DIMM A1', 'memory_device_type': 'DDR4', 'capacity_mi_b': 16384, 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'power': {'0': {'name': 'PS1 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}, '1': {'name': 'PS2 Status', 'model': 'PWR SPLY,800W,RDNT,LTON', 'manufacturer': 'DELL', 'status': {'health': 'OK', 'state': 'Enabled'}}}, 'fans': {'0': {'name': 'System Board Fan1A', 'physical_context': 'SystemBoard', 'status': {'health': 'OK', 'state': 'Enabled'}}}}, 'firmwares': {'installed-28897-6.10.30.20__usc.embedded.1:lc.embedded.1': {'name': 'Lifecycle Controller', 'description': 'Represents Firmware Inventory', 'release_date': '00:00:00Z', 'version': '6.10.30.20', 'updateable': True, 'status': {'health': 'OK', 'state': 'Enabled'}}}}}
diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py
index 524da9c00..7994c390a 100644
--- a/src/pybind/mgr/cephadm/tests/test_autotune.py
+++ b/src/pybind/mgr/cephadm/tests/test_autotune.py
@@ -46,6 +46,17 @@ from orchestrator import DaemonDescription
],
{},
62 * 1024 * 1024 * 1024,
+ ),
+ (
+ 128 * 1024 * 1024 * 1024,
+ [
+ DaemonDescription('mgr', 'a', 'host1'),
+ DaemonDescription('osd', '1', 'host1'),
+ DaemonDescription('osd', '2', 'host1'),
+ DaemonDescription('nvmeof', 'a', 'host1'),
+ ],
+ {},
+ 60 * 1024 * 1024 * 1024,
)
])
def test_autotune(total, daemons, config, result):
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 24fcb0280..2477de13e 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -400,6 +400,42 @@ class TestCephadm(object):
assert 'myerror' in ''.join(evs)
+ @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
+ def test_daemon_action_event_timestamp_update(self, cephadm_module: CephadmOrchestrator):
+        # Test to make sure that if a new daemon event is created with the same
+        # subject and message, the timestamp of the event is updated to let users
+        # know when it most recently occurred.
+ cephadm_module.service_cache_timeout = 10
+ with with_host(cephadm_module, 'test'):
+ with with_service(cephadm_module, RGWSpec(service_id='myrgw.foobar', unmanaged=True)) as _, \
+ with_daemon(cephadm_module, RGWSpec(service_id='myrgw.foobar'), 'test') as daemon_id:
+
+ d_name = 'rgw.' + daemon_id
+
+ now = str_to_datetime('2023-10-18T22:45:29.119250Z')
+ with mock.patch("cephadm.inventory.datetime_now", lambda: now):
+ c = cephadm_module.daemon_action('redeploy', d_name)
+ assert wait(cephadm_module,
+ c) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'"
+
+ CephadmServe(cephadm_module)._check_daemons()
+
+ d_events = cephadm_module.events.get_for_daemon(d_name)
+ assert len(d_events) == 1
+ assert d_events[0].created == now
+
+ later = str_to_datetime('2023-10-18T23:46:37.119250Z')
+ with mock.patch("cephadm.inventory.datetime_now", lambda: later):
+ c = cephadm_module.daemon_action('redeploy', d_name)
+ assert wait(cephadm_module,
+ c) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'"
+
+ CephadmServe(cephadm_module)._check_daemons()
+
+ d_events = cephadm_module.events.get_for_daemon(d_name)
+ assert len(d_events) == 1
+ assert d_events[0].created == later
+
@pytest.mark.parametrize(
"action",
[
@@ -1157,7 +1193,8 @@ class TestCephadm(object):
@mock.patch('cephadm.services.osd.OSDService.driveselection_to_ceph_volume')
@mock.patch('cephadm.services.osd.OsdIdClaims.refresh', lambda _: None)
@mock.patch('cephadm.services.osd.OsdIdClaims.get', lambda _: {})
- def test_limit_not_reached(self, d_to_cv, _run_cv_cmd, cephadm_module):
+ @mock.patch('cephadm.inventory.HostCache.get_daemons_by_service')
+ def test_limit_not_reached(self, _get_daemons_by_service, d_to_cv, _run_cv_cmd, cephadm_module):
with with_host(cephadm_module, 'test'):
dg = DriveGroupSpec(placement=PlacementSpec(host_pattern='test'),
data_devices=DeviceSelection(limit=5, rotational=1),
@@ -1167,12 +1204,14 @@ class TestCephadm(object):
'[{"data": "/dev/vdb", "data_size": "50.00 GB", "encryption": "None"}, {"data": "/dev/vdc", "data_size": "50.00 GB", "encryption": "None"}]']
d_to_cv.return_value = 'foo'
_run_cv_cmd.side_effect = async_side_effect((disks_found, '', 0))
+ _get_daemons_by_service.return_value = [DaemonDescription(daemon_type='osd', hostname='test', service_name='not_enough')]
preview = cephadm_module.osd_service.generate_previews([dg], 'test')
for osd in preview:
assert 'notes' in osd
assert osd['notes'] == [
- 'NOTE: Did not find enough disks matching filter on host test to reach data device limit (Found: 2 | Limit: 5)']
+ ('NOTE: Did not find enough disks matching filter on host test to reach '
+ 'data device limit\n(New Devices: 2 | Existing Matching Daemons: 1 | Limit: 5)')]
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_prepare_drivegroup(self, cephadm_module):
@@ -1251,7 +1290,11 @@ class TestCephadm(object):
))
@mock.patch("cephadm.services.osd.OSD.exists", True)
@mock.patch("cephadm.services.osd.RemoveUtil.get_pg_count", lambda _, __: 0)
- def test_remove_osds(self, cephadm_module):
+ @mock.patch("cephadm.services.osd.RemoveUtil.get_weight")
+ @mock.patch("cephadm.services.osd.RemoveUtil.reweight_osd")
+ def test_remove_osds(self, _reweight_osd, _get_weight, cephadm_module):
+ osd_initial_weight = 2.1
+ _get_weight.return_value = osd_initial_weight
with with_host(cephadm_module, 'test'):
CephadmServe(cephadm_module)._refresh_host_daemons('test')
c = cephadm_module.list_daemons()
@@ -1261,13 +1304,23 @@ class TestCephadm(object):
out = wait(cephadm_module, c)
assert out == ["Removed osd.0 from host 'test'"]
- cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0,
- replace=False,
- force=False,
- hostname='test',
- process_started_at=datetime_now(),
- remove_util=cephadm_module.to_remove_osds.rm_util
- ))
+ osd_0 = OSD(osd_id=0,
+ replace=False,
+ force=False,
+ hostname='test',
+ process_started_at=datetime_now(),
+ remove_util=cephadm_module.to_remove_osds.rm_util
+ )
+
+ cephadm_module.to_remove_osds.enqueue(osd_0)
+ _get_weight.assert_called()
+
+ # test that OSD is properly reweighted on removal
+ cephadm_module.stop_remove_osds([0])
+ _reweight_osd.assert_called_with(mock.ANY, osd_initial_weight)
+
+ # add OSD back to queue and test normal removal queue processing
+ cephadm_module.to_remove_osds.enqueue(osd_0)
cephadm_module.to_remove_osds.process_removal_queue()
assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module)
diff --git a/src/pybind/mgr/cephadm/tests/test_configchecks.py b/src/pybind/mgr/cephadm/tests/test_configchecks.py
index 3cae0a27d..ff1e21861 100644
--- a/src/pybind/mgr/cephadm/tests/test_configchecks.py
+++ b/src/pybind/mgr/cephadm/tests/test_configchecks.py
@@ -238,6 +238,7 @@ class FakeMgr:
self.default_version = 'quincy'
self.version_overrides = {}
self.daemon_to_host = {}
+ self.config_checks_enabled = True
self.cache = HostCache(self)
self.upgrade = CephadmUpgrade(self)
@@ -623,9 +624,7 @@ class TestConfigCheck:
assert 'ceph_release' in checker.skipped_checks
def test_skip_when_disabled(self, mgr):
- mgr.module_option.update({
- "config_checks_enabled": "false"
- })
+ mgr.config_checks_enabled = False
checker = CephadmConfigChecks(mgr)
checker.cluster_network_list = []
checker.public_network_list = ['10.9.64.0/24']
diff --git a/src/pybind/mgr/cephadm/tests/test_node_proxy.py b/src/pybind/mgr/cephadm/tests/test_node_proxy.py
new file mode 100644
index 000000000..b19bb5dbc
--- /dev/null
+++ b/src/pybind/mgr/cephadm/tests/test_node_proxy.py
@@ -0,0 +1,312 @@
+import cherrypy
+import json
+from _pytest.monkeypatch import MonkeyPatch
+from urllib.error import URLError
+from cherrypy.test import helper
+from cephadm.agent import NodeProxyEndpoint
+from unittest.mock import MagicMock, call, patch
+from cephadm.inventory import AgentCache, NodeProxyCache, Inventory
+from cephadm.ssl_cert_utils import SSLCerts
+from . import node_proxy_data
+
+PORT = 58585
+
+
+class FakeMgr:
+ def __init__(self) -> None:
+ self.log = MagicMock()
+ self.get_store = MagicMock(return_value=json.dumps(node_proxy_data.mgr_inventory_cache))
+ self.set_store = MagicMock()
+ self.set_health_warning = MagicMock()
+ self.remove_health_warning = MagicMock()
+ self.inventory = Inventory(self)
+ self.agent_cache = AgentCache(self)
+ self.agent_cache.agent_ports = {"host01": 1234}
+ self.node_proxy_cache = NodeProxyCache(self)
+ self.node_proxy_cache.save = MagicMock()
+ self.node_proxy = MagicMock()
+ self.http_server = MagicMock()
+ self.http_server.agent = MagicMock()
+ self.http_server.agent.ssl_certs = SSLCerts()
+ self.http_server.agent.ssl_certs.generate_root_cert(self.get_mgr_ip())
+
+ def get_mgr_ip(self) -> str:
+ return '0.0.0.0'
+
+
+class TestNodeProxyEndpoint(helper.CPWebCase):
+ mgr = FakeMgr()
+ app = NodeProxyEndpoint(mgr)
+ mgr.node_proxy_cache.keyrings = {"host01": "fake-secret01",
+ "host02": "fake-secret02"}
+ mgr.node_proxy_cache.oob = {"host01": {"username": "oob-user01",
+ "password": "oob-pass01"},
+ "host02": {"username": "oob-user02",
+ "password": "oob-pass02"}}
+ mgr.node_proxy_cache.data = node_proxy_data.full_set
+
+ @classmethod
+ def setup_server(cls):
+ # cherrypy.tree.mount(NodeProxyEndpoint(TestNodeProxyEndpoint.mgr))
+ cherrypy.tree.mount(TestNodeProxyEndpoint.app)
+ cherrypy.config.update({'global': {
+ 'server.socket_host': '127.0.0.1',
+ 'server.socket_port': PORT}})
+
+ def setUp(self):
+ self.PORT = PORT
+ self.monkeypatch = MonkeyPatch()
+
+ def test_oob_data_misses_cephx_field(self):
+ data = '{}'
+ self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('400 Bad Request')
+
+ def test_oob_data_misses_name_field(self):
+ data = '{"cephx": {"secret": "fake-secret"}}'
+ self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('400 Bad Request')
+
+ def test_oob_data_misses_secret_field(self):
+ data = '{"cephx": {"name": "node-proxy.host01"}}'
+ self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('400 Bad Request')
+
+ def test_oob_agent_not_running(self):
+ data = '{"cephx": {"name": "node-proxy.host03", "secret": "fake-secret03"}}'
+ self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('502 Bad Gateway')
+
+ def test_oob_wrong_keyring(self):
+ data = '{"cephx": {"name": "node-proxy.host01", "secret": "wrong-keyring"}}'
+ self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('403 Forbidden')
+
+ def test_oob_ok(self):
+ data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}'
+ self.getPage("/oob", method="POST", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('200 OK')
+
+ def test_data_missing_patch(self):
+ data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}'
+ self.getPage("/data", method="POST", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('400 Bad Request')
+
+ def test_data_raises_alert(self):
+ patch = node_proxy_data.full_set_with_critical
+ data = {"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}, "patch": patch}
+ data_str = json.dumps(data)
+ self.getPage("/data", method="POST", body=data_str, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data_str)))])
+ self.assertStatus('200 OK')
+
+ calls = [call('HARDWARE_STORAGE',
+ count=2,
+ detail=['disk.bay.0:enclosure.internal.0-1:raid.integrated.1-1 is critical: Enabled',
+ 'disk.bay.9:enclosure.internal.0-1 is critical: Enabled'],
+ summary='2 storage members are not ok'),
+ call('HARDWARE_MEMORY',
+ count=1,
+ detail=['dimm.socket.a1 is critical: Enabled'],
+ summary='1 memory member is not ok')]
+
+ assert TestNodeProxyEndpoint.mgr.set_health_warning.mock_calls == calls
+
+ def test_led_GET_no_hostname(self):
+ self.getPage("/led", method="GET")
+ self.assertStatus('501 Not Implemented')
+
+ def test_led_PATCH_no_hostname(self):
+ data = "{}"
+ self.getPage("/led", method="PATCH", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('501 Not Implemented')
+
+ def test_set_led_no_type(self):
+ data = '{"state": "on", "keyring": "fake-secret01"}'
+ self.getPage("/host01/led", method="PATCH", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('400 Bad Request')
+
+ def test_set_chassis_led(self):
+ data = '{"state": "on", "keyring": "fake-secret01"}'
+ with patch('cephadm.agent.http_req') as p:
+ p.return_value = [], '{}', 200
+ self.getPage("/host01/led/chassis", method="PATCH", body=data, headers=[('Content-Type', 'application/json'),
+ ('Content-Length', str(len(data)))])
+ self.assertStatus('200 OK')
+
+ def test_get_led_missing_type(self):
+ self.getPage("/host01/led", method="GET")
+ self.assertStatus('400 Bad Request')
+
+ def test_get_led_no_hostname(self):
+ self.getPage("/led", method="GET")
+ self.assertStatus('501 Not Implemented')
+
+ def test_get_led_type_chassis_no_hostname(self):
+ self.getPage("/led/chassis", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_get_led_type_drive_no_hostname(self):
+ self.getPage("/led/chassis", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_get_led_type_drive_missing_id(self):
+ self.getPage("/host01/led/drive", method="GET")
+ self.assertStatus('400 Bad Request')
+
+ def test_get_led_url_error(self):
+ with patch('cephadm.agent.http_req') as p:
+ p.side_effect = URLError('fake error')
+ self.getPage("/host02/led/chassis", method="GET")
+ self.assertStatus('502 Bad Gateway')
+
+ def test_get_chassis_led_ok(self):
+ with patch('cephadm.agent.http_req', return_value=MagicMock()) as p:
+ p.return_value = [], '{}', 200
+ self.getPage("/host01/led/chassis", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_get_drive_led_without_id(self):
+ self.getPage("/host01/led/drive", method="GET")
+ self.assertStatus('400 Bad Request')
+
+ def test_get_drive_led_with_id(self):
+ with patch('cephadm.agent.http_req', return_value=MagicMock()) as p:
+ p.return_value = [], '{}', 200
+ self.getPage("/host01/led/drive/123", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_fullreport_with_valid_hostname(self):
+ # data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}'
+ # self.getPage("/host02/fullreport", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))])
+ self.getPage("/host02/fullreport", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_fullreport_no_hostname(self):
+ # data = '{"cephx": {"name": "node-proxy.host01", "secret": "fake-secret01"}}'
+ # self.getPage("/fullreport", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))])
+ self.getPage("/fullreport", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_fullreport_with_invalid_hostname(self):
+ # data = '{"cephx": {"name": "node-proxy.host03", "secret": "fake-secret03"}}'
+ # self.getPage("/host03/fullreport", method="POST", body=data, headers=[('Content-Type', 'application/json'), ('Content-Length', str(len(data)))])
+ self.getPage("/host03/fullreport", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_summary_with_valid_hostname(self):
+ self.getPage("/host02/summary", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_summary_no_hostname(self):
+ self.getPage("/summary", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_summary_with_invalid_hostname(self):
+ self.getPage("/host03/summary", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_criticals_with_valid_hostname(self):
+ self.getPage("/host02/criticals", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_criticals_no_hostname(self):
+ self.getPage("/criticals", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_criticals_with_invalid_hostname(self):
+ self.getPage("/host03/criticals", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_memory_with_valid_hostname(self):
+ self.getPage("/host02/memory", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_memory_no_hostname(self):
+ self.getPage("/memory", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_memory_with_invalid_hostname(self):
+ self.getPage("/host03/memory", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_network_with_valid_hostname(self):
+ self.getPage("/host02/network", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_network_no_hostname(self):
+ self.getPage("/network", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_network_with_invalid_hostname(self):
+ self.getPage("/host03/network", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_processors_with_valid_hostname(self):
+ self.getPage("/host02/processors", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_processors_no_hostname(self):
+ self.getPage("/processors", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_processors_with_invalid_hostname(self):
+ self.getPage("/host03/processors", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_storage_with_valid_hostname(self):
+ self.getPage("/host02/storage", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_storage_no_hostname(self):
+ self.getPage("/storage", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_storage_with_invalid_hostname(self):
+ self.getPage("/host03/storage", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_power_with_valid_hostname(self):
+ self.getPage("/host02/power", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_power_no_hostname(self):
+ self.getPage("/power", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_power_with_invalid_hostname(self):
+ self.getPage("/host03/power", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_fans_with_valid_hostname(self):
+ self.getPage("/host02/fans", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_fans_no_hostname(self):
+ self.getPage("/fans", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_fans_with_invalid_hostname(self):
+ self.getPage("/host03/fans", method="GET")
+ self.assertStatus('404 Not Found')
+
+ def test_firmwares_with_valid_hostname(self):
+ self.getPage("/host02/firmwares", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_firmwares_no_hostname(self):
+ self.getPage("/firmwares", method="GET")
+ self.assertStatus('200 OK')
+
+ def test_firmwares_with_invalid_hostname(self):
+ self.getPage("/host03/firmwares", method="GET")
+ self.assertStatus('404 Not Found')
diff --git a/src/pybind/mgr/cephadm/tests/test_scheduling.py b/src/pybind/mgr/cephadm/tests/test_scheduling.py
index 067cd5028..f445ed6f0 100644
--- a/src/pybind/mgr/cephadm/tests/test_scheduling.py
+++ b/src/pybind/mgr/cephadm/tests/test_scheduling.py
@@ -6,7 +6,13 @@ from typing import NamedTuple, List, Dict, Optional
import pytest
from ceph.deployment.hostspec import HostSpec
-from ceph.deployment.service_spec import ServiceSpec, PlacementSpec, IngressSpec
+from ceph.deployment.service_spec import (
+ ServiceSpec,
+ PlacementSpec,
+ IngressSpec,
+ PatternType,
+ HostPattern,
+)
from ceph.deployment.hostspec import SpecValidationError
from cephadm.module import HostAssignment
@@ -631,6 +637,17 @@ class NodeAssignmentTest(NamedTuple):
'rgw:host2(*:81)', 'rgw:host3(*:81)'],
['rgw.c']
),
+ # label + host pattern
+        # Note all hosts will get the "foo" label; we are checking
+        # that it also filters on the host pattern when a label is provided
+ NodeAssignmentTest(
+ 'mgr',
+ PlacementSpec(label='foo', host_pattern='mgr*'),
+ 'mgr1 mgr2 osd1'.split(),
+ [],
+ None, None,
+ ['mgr:mgr1', 'mgr:mgr2'], ['mgr:mgr1', 'mgr:mgr2'], []
+ ),
# cephadm.py teuth case
NodeAssignmentTest(
'mgr',
@@ -1697,3 +1714,42 @@ def test_drain_from_explict_placement(service_type, placement, hosts, maintenanc
).place()
assert sorted([h.hostname for h in to_add]) in expected_add
assert sorted([h.name() for h in to_remove]) in expected_remove
+
+
+class RegexHostPatternTest(NamedTuple):
+ service_type: str
+ placement: PlacementSpec
+ hosts: List[str]
+ expected_add: List[List[str]]
+
+
+@pytest.mark.parametrize("service_type,placement,hosts,expected_add",
+ [
+ RegexHostPatternTest(
+ 'crash',
+ PlacementSpec(host_pattern=HostPattern(pattern='host1|host3', pattern_type=PatternType.regex)),
+ 'host1 host2 host3 host4'.split(),
+ ['host1', 'host3'],
+ ),
+ RegexHostPatternTest(
+ 'crash',
+ PlacementSpec(host_pattern=HostPattern(pattern='host[2-4]', pattern_type=PatternType.regex)),
+ 'host1 host2 host3 host4'.split(),
+ ['host2', 'host3', 'host4'],
+ ),
+ ])
+def test_placement_regex_host_pattern(service_type, placement, hosts, expected_add):
+ spec = ServiceSpec(service_type=service_type,
+ service_id='test',
+ placement=placement)
+
+ host_specs = [HostSpec(h) for h in hosts]
+
+ hosts, to_add, to_remove = HostAssignment(
+ spec=spec,
+ hosts=host_specs,
+ unreachable_hosts=[],
+ draining_hosts=[],
+ daemons=[],
+ ).place()
+ assert sorted([h.hostname for h in to_add]) == expected_add
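For illustration only (not part of the patch): the new test cases exercise both host_pattern flavours, the default glob-style pattern ('mgr*') and the explicit PatternType.regex patterns. A rough standalone sketch of the matching semantics, assuming glob patterns behave like fnmatch and regex patterns like re.fullmatch:

    import fnmatch
    import re

    hosts = ['host1', 'host2', 'host3', 'host4', 'mgr1', 'mgr2', 'osd1']

    print([h for h in hosts if fnmatch.fnmatch(h, 'mgr*')])      # ['mgr1', 'mgr2']
    print([h for h in hosts if re.fullmatch('host[2-4]', h)])    # ['host2', 'host3', 'host4']
    print([h for h in hosts if re.fullmatch('host1|host3', h)])  # ['host1', 'host3']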
diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py
index ff98a1388..687b64553 100644
--- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py
+++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py
@@ -19,6 +19,9 @@ class FakeCache:
if service_type == 'ceph-exporter':
return [FakeDaemonDescription('1.2.3.4', [9926], 'node0'),
FakeDaemonDescription('1.2.3.5', [9926], 'node1')]
+ if service_type == 'nvmeof':
+ return [FakeDaemonDescription('1.2.3.4', [10008], 'node0'),
+ FakeDaemonDescription('1.2.3.5', [10008], 'node1')]
return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'),
FakeDaemonDescription('1.2.3.5', [9200], 'node1')]
@@ -171,6 +174,20 @@ class TestServiceDiscovery:
# check content
assert cfg[0]['targets'] == ['1.2.3.4:9926']
+ def test_get_sd_config_nvmeof(self):
+ mgr = FakeMgr()
+ root = Root(mgr, 5000, '0.0.0.0')
+ cfg = root.get_sd_config('nvmeof')
+
+ # check response structure
+ assert cfg
+ for entry in cfg:
+ assert 'labels' in entry
+ assert 'targets' in entry
+
+ # check content
+ assert cfg[0]['targets'] == ['1.2.3.4:10008']
+
def test_get_sd_config_invalid_service(self):
mgr = FakeMgr()
root = Root(mgr, 5000, '0.0.0.0')
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index 2300b288d..1265a39f6 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -376,6 +376,9 @@ port = {default_port}
enable_auth = False
state_update_notify = True
state_update_interval_sec = 5
+enable_prometheus_exporter = True
+prometheus_exporter_ssl = False
+prometheus_port = 10008
[ceph]
pool = {pool}
@@ -665,7 +668,9 @@ class TestMonitoring:
keepalived_password='12345',
virtual_ip="1.2.3.4/32",
backend_service='rgw.foo')) as _, \
- with_service(cephadm_module, PrometheusSpec('prometheus')) as _:
+ with_service(cephadm_module, PrometheusSpec('prometheus',
+ networks=['1.2.3.0/24'],
+ only_bind_port_on_networks=True)) as _:
y = dedent("""
# This file is generated by cephadm.
@@ -699,6 +704,10 @@ class TestMonitoring:
honor_labels: true
http_sd_configs:
- url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter
+
+ - job_name: 'nvmeof'
+ http_sd_configs:
+ - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
""").lstrip()
_run_cephadm.assert_called_with(
@@ -713,11 +722,12 @@ class TestMonitoring:
"deploy_arguments": [],
"params": {
'tcp_ports': [9095],
+ 'port_ips': {'8765': '1.2.3.1'}
},
"meta": {
'service_name': 'prometheus',
'ports': [9095],
- 'ip': None,
+ 'ip': '1.2.3.1',
'deployed_by': [],
'rank': None,
'rank_generation': None,
@@ -731,6 +741,7 @@ class TestMonitoring:
},
'retention_time': '15d',
'retention_size': '0',
+ 'ip_to_bind_to': '1.2.3.1',
},
}),
)
@@ -855,6 +866,19 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+
+ - job_name: 'nvmeof'
+ honor_labels: true
+ scheme: https
+ tls_config:
+ ca_file: root_cert.pem
+ http_sd_configs:
+ - url: https://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
+ basic_auth:
+ username: sd_user
+ password: sd_password
+ tls_config:
+ ca_file: root_cert.pem
""").lstrip()
_run_cephadm.assert_called_with(
@@ -892,6 +916,7 @@ class TestMonitoring:
},
'retention_time': '15d',
'retention_size': '0',
+ 'ip_to_bind_to': '',
'web_config': '/etc/prometheus/web.yml',
},
}),
@@ -1633,7 +1658,7 @@ class TestIngressService:
)
if enable_haproxy_protocol:
haproxy_txt += ' default-server send-proxy-v2\n'
- haproxy_txt += ' server nfs.foo.0 192.168.122.111:12049\n'
+ haproxy_txt += ' server nfs.foo.0 192.168.122.111:12049 check\n'
haproxy_expected_conf = {
'files': {'haproxy.cfg': haproxy_txt}
}
@@ -1783,7 +1808,7 @@ class TestIngressService:
'balance static-rr\n '
'option httpchk HEAD / HTTP/1.0\n '
'server '
- + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100\n'
+ + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100 inter 2s\n'
}
}
@@ -1908,7 +1933,7 @@ class TestIngressService:
'balance static-rr\n '
'option httpchk HEAD / HTTP/1.0\n '
'server '
- + haproxy_generated_conf[1][0] + ' 1::4:443 check weight 100\n'
+ + haproxy_generated_conf[1][0] + ' 1::4:443 check weight 100 inter 2s\n'
}
}
@@ -2032,7 +2057,7 @@ class TestIngressService:
'balance static-rr\n '
'option httpchk HEAD / HTTP/1.0\n '
'server '
- + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100\n'
+ + haproxy_generated_conf[1][0] + ' 1.2.3.7:80 check weight 100 inter 2s\n'
}
}
@@ -2411,7 +2436,7 @@ class TestIngressService:
' balance source\n'
' hash-type consistent\n'
' default-server send-proxy-v2\n'
- ' server nfs.foo.0 192.168.122.111:12049\n'
+ ' server nfs.foo.0 192.168.122.111:12049 check\n'
)
haproxy_expected_conf = {
'files': {'haproxy.cfg': haproxy_txt}
@@ -2431,6 +2456,7 @@ class TestIngressService:
' Delegations = false;\n'
" RecoveryBackend = 'rados_cluster';\n"
' Minor_Versions = 1, 2;\n'
+ ' IdmapConf = "/etc/ganesha/idmap.conf";\n'
'}\n'
'\n'
'RADOS_KV {\n'
@@ -2454,7 +2480,7 @@ class TestIngressService:
"%url rados://.nfs/foo/conf-nfs.foo"
)
nfs_expected_conf = {
- 'files': {'ganesha.conf': nfs_ganesha_txt},
+ 'files': {'ganesha.conf': nfs_ganesha_txt, 'idmap.conf': ''},
'config': '',
'extra_args': ['-N', 'NIV_EVENT'],
'keyring': (
diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py
index 63672936c..3aedfbd86 100644
--- a/src/pybind/mgr/cephadm/utils.py
+++ b/src/pybind/mgr/cephadm/utils.py
@@ -31,7 +31,7 @@ RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs']
CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES
# these daemon types use the ceph container image
-CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs']
+CEPH_IMAGE_TYPES = CEPH_TYPES + ['iscsi', 'nfs', 'node-proxy']
# these daemons do not use the ceph image. There are other daemons
# that also don't use the ceph image, but we only care about those