summaryrefslogtreecommitdiffstats
path: root/staslib/ctrl.py
diff options
context:
space:
mode:
authorBenjamin Drung <bdrung@debian.org>2023-06-10 08:55:33 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2023-06-10 09:21:49 +0000
commit88837172f69eabc408ae3945d82e0270b8e07440 (patch)
treed6b7fa06694f45d25f54f6ea9ded93c981e51f6f /staslib/ctrl.py
parentInitial commit. (diff)
downloadnvme-stas-88837172f69eabc408ae3945d82e0270b8e07440.tar.xz
nvme-stas-88837172f69eabc408ae3945d82e0270b8e07440.zip
Adding upstream version 2.2.1.upstream/2.2.1
Signed-off-by: Benjamin Drung <bdrung@debian.org>
Diffstat (limited to '')
-rw-r--r--staslib/ctrl.py850
1 files changed, 850 insertions, 0 deletions
diff --git a/staslib/ctrl.py b/staslib/ctrl.py
new file mode 100644
index 0000000..97a1c7b
--- /dev/null
+++ b/staslib/ctrl.py
@@ -0,0 +1,850 @@
+# Copyright (c) 2022, Dell Inc. or its subsidiaries. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# See the LICENSE file for details.
+#
+# This file is part of NVMe STorage Appliance Services (nvme-stas).
+#
+# Authors: Martin Belanger <Martin.Belanger@dell.com>
+#
+'''This module defines the base Controller object from which the
+Dc (Discovery Controller) and Ioc (I/O Controller) objects are derived.'''
+
+import time
+import inspect
+import logging
+from gi.repository import GLib
+from libnvme import nvme
+from staslib import conf, defs, gutil, trid, udev, stas
+
+
+DLP_CHANGED = (
+ (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE
+) # 0x70f002
+
+
+def get_eflags(dlpe):
+ '''@brief Return eflags field of dlpe'''
+ return int(dlpe.get('eflags', 0)) if dlpe else 0
+
+
+def get_ncc(eflags: int):
+ '''@brief Return True if Not Connected to CDC bit is asserted, False otherwise'''
+ return eflags & nvme.NVMF_DISC_EFLAGS_NCC != 0
+
+
+def dlp_supp_opts_as_string(dlp_supp_opts: int):
+ '''@brief Return the list of options supported by the Get
+ discovery log page command.
+ '''
+ data = {
+ nvme.NVMF_LOG_DISC_LID_EXTDLPES: "EXTDLPES",
+ nvme.NVMF_LOG_DISC_LID_PLEOS: "PLEOS",
+ nvme.NVMF_LOG_DISC_LID_ALLSUBES: "ALLSUBES",
+ }
+ return [txt for msk, txt in data.items() if dlp_supp_opts & msk]
+
+
+# ******************************************************************************
+class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attributes
+ '''@brief Base class used to manage the connection to a controller.'''
+
+ def __init__(self, tid: trid.TID, service, discovery_ctrl: bool = False):
+ sysconf = conf.SysConf()
+ self._nvme_options = conf.NvmeOptions()
+ self._root = nvme.root()
+ self._host = nvme.host(
+ self._root, hostnqn=sysconf.hostnqn, hostid=sysconf.hostid, hostsymname=sysconf.hostsymname
+ )
+ self._host.dhchap_key = sysconf.hostkey if self._nvme_options.dhchap_hostkey_supp else None
+ self._udev = udev.UDEV
+ self._device = None # Refers to the nvme device (e.g. /dev/nvme[n])
+ self._ctrl = None # libnvme's nvme.ctrl object
+ self._connect_op = None
+
+ super().__init__(tid, service, discovery_ctrl)
+
+ def _release_resources(self):
+ logging.debug('Controller._release_resources() - %s | %s', self.id, self.device)
+
+ if self._udev:
+ self._udev.unregister_for_device_events(self._on_udev_notification)
+
+ self._kill_ops()
+
+ super()._release_resources()
+
+ self._ctrl = None
+ self._udev = None
+ self._host = None
+ self._root = None
+ self._nvme_options = None
+
+ @property
+ def device(self) -> str:
+ '''@brief return the Linux nvme device id (e.g. nvme3) or empty
+ string if no device is associated with this controller'''
+ if not self._device and self._ctrl and self._ctrl.name:
+ self._device = self._ctrl.name
+
+ return self._device or 'nvme?'
+
+ def all_ops_completed(self) -> bool:
+ '''@brief Returns True if all operations have completed. False otherwise.'''
+ return self._connect_op is None or self._connect_op.completed()
+
+ def connected(self):
+ '''@brief Return whether a connection is established'''
+ return self._ctrl and self._ctrl.connected()
+
+ def controller_id_dict(self) -> dict:
+ '''@brief return the controller ID as a dict.'''
+ cid = super().controller_id_dict()
+ cid['device'] = self.device
+ return cid
+
+ def details(self) -> dict:
+ '''@brief return detailed debug info about this controller'''
+ details = super().details()
+ details.update(
+ self._udev.get_attributes(self.device, ('hostid', 'hostnqn', 'model', 'serial', 'dctype', 'cntrltype'))
+ )
+ details['connected'] = str(self.connected())
+ return details
+
+ def info(self) -> dict:
+ '''@brief Get the controller info for this object'''
+ info = super().info()
+ if self._connect_op:
+ info['connect operation'] = str(self._connect_op.as_dict())
+ return info
+
+ def cancel(self):
+ '''@brief Used to cancel pending operations.'''
+ super().cancel()
+ if self._connect_op:
+ self._connect_op.cancel()
+
+ def _kill_ops(self):
+ if self._connect_op:
+ self._connect_op.kill()
+ self._connect_op = None
+
+ def set_level_from_tron(self, tron):
+ '''Set log level based on TRON'''
+ if self._root:
+ self._root.log_level("debug" if tron else "err")
+
+ def _on_udev_notification(self, udev_obj):
+ if self._alive():
+ if udev_obj.action == 'change':
+ nvme_aen = udev_obj.get('NVME_AEN')
+ nvme_event = udev_obj.get('NVME_EVENT')
+ if isinstance(nvme_aen, str):
+ logging.info('%s | %s - Received AEN: %s', self.id, udev_obj.sys_name, nvme_aen)
+ self._on_aen(int(nvme_aen, 16))
+ if isinstance(nvme_event, str):
+ self._on_nvme_event(nvme_event)
+ elif udev_obj.action == 'remove':
+ logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name)
+ self._on_ctrl_removed(udev_obj)
+ else:
+ logging.debug(
+ 'Controller._on_udev_notification() - %s | %s: Received "%s" event',
+ self.id,
+ udev_obj.sys_name,
+ udev_obj.action,
+ )
+ else:
+ logging.debug(
+ 'Controller._on_udev_notification() - %s | %s: Received event on dead object. udev_obj %s: %s',
+ self.id,
+ self.device,
+ udev_obj.action,
+ udev_obj.sys_name,
+ )
+
+ def _on_ctrl_removed(self, udev_obj): # pylint: disable=unused-argument
+ if self._udev:
+ self._udev.unregister_for_device_events(self._on_udev_notification)
+ self._kill_ops() # Kill all pending operations
+ self._ctrl = None
+
+ # Defer removal of this object to the next main loop's idle period.
+ GLib.idle_add(self._serv.remove_controller, self, True)
+
+ def _get_cfg(self):
+ '''Get configuration parameters. These may either come from the [Global]
+ section or from a "controller" entry in the configuration file. A
+ definition found in a "controller" entry overrides the same definition
+ found in the [Global] section.
+ '''
+ cfg = {}
+ service_conf = conf.SvcConf()
+ for option, keyword in (
+ ('kato', 'keep_alive_tmo'),
+ ('queue-size', 'queue_size'),
+ ('hdr-digest', 'hdr_digest'),
+ ('data-digest', 'data_digest'),
+ ('nr-io-queues', 'nr_io_queues'),
+ ('ctrl-loss-tmo', 'ctrl_loss_tmo'),
+ ('disable-sqflow', 'disable_sqflow'),
+ ('nr-poll-queues', 'nr_poll_queues'),
+ ('nr-write-queues', 'nr_write_queues'),
+ ('reconnect-delay', 'reconnect_delay'),
+ ):
+ # Check if the value is defined as a "controller" entry (i.e. override)
+ ovrd_val = self.tid.cfg.get(option, None)
+ if ovrd_val is not None:
+ cfg[keyword] = ovrd_val
+ else:
+ # Check if the value is found in the [Global] section.
+ glob_val = service_conf.get_option('Global', option)
+ if glob_val is not None:
+ cfg[keyword] = glob_val
+
+ return cfg
+
+ def _do_connect(self):
+ service_conf = conf.SvcConf()
+ host_iface = (
+ self.tid.host_iface
+ if (self.tid.host_iface and not service_conf.ignore_iface and self._nvme_options.host_iface_supp)
+ else None
+ )
+ self._ctrl = nvme.ctrl(
+ self._root,
+ subsysnqn=self.tid.subsysnqn,
+ transport=self.tid.transport,
+ traddr=self.tid.traddr,
+ trsvcid=self.tid.trsvcid if self.tid.trsvcid else None,
+ host_traddr=self.tid.host_traddr if self.tid.host_traddr else None,
+ host_iface=host_iface,
+ )
+ self._ctrl.discovery_ctrl_set(self._discovery_ctrl)
+
+ # Set the DHCHAP key on the controller
+ # NOTE that this will eventually have to
+ # change once we have support for AVE (TP8019)
+ ctrl_dhchap_key = self.tid.cfg.get('dhchap-ctrl-secret')
+ if ctrl_dhchap_key and self._nvme_options.dhchap_ctrlkey_supp:
+ has_dhchap_key = hasattr(self._ctrl, 'dhchap_key')
+ if not has_dhchap_key:
+ logging.warning(
+ '%s | %s - libnvme-%s does not allow setting the controller DHCHAP key. Please upgrade libnvme.',
+ self.id,
+ self.device,
+ defs.LIBNVME_VERSION,
+ )
+ else:
+ self._ctrl.dhchap_key = ctrl_dhchap_key
+
+ # Audit existing nvme devices. If we find a match, then
+ # we'll just borrow that device instead of creating a new one.
+ udev_obj = self._find_existing_connection()
+ if udev_obj is not None:
+ # A device already exists.
+ self._device = udev_obj.sys_name
+ logging.debug(
+ 'Controller._do_connect() - %s Found existing control device: %s', self.id, udev_obj.sys_name
+ )
+ self._connect_op = gutil.AsyncTask(
+ self._on_connect_success, self._on_connect_fail, self._ctrl.init, self._host, int(udev_obj.sys_number)
+ )
+ else:
+ cfg = self._get_cfg()
+ logging.debug(
+ 'Controller._do_connect() - %s Connecting to nvme control with cfg=%s', self.id, cfg
+ )
+ self._connect_op = gutil.AsyncTask(
+ self._on_connect_success, self._on_connect_fail, self._ctrl.connect, self._host, cfg
+ )
+
+ self._connect_op.run_async()
+
+ # --------------------------------------------------------------------------
+ def _on_connect_success(self, op_obj: gutil.AsyncTask, data):
+ '''@brief Function called when we successfully connect to the
+ Controller.
+ '''
+ op_obj.kill()
+ self._connect_op = None
+
+ if self._alive():
+ self._device = self._ctrl.name
+ logging.info('%s | %s - Connection established!', self.id, self.device)
+ self._connect_attempts = 0
+ self._udev.register_for_device_events(self._device, self._on_udev_notification)
+ else:
+ logging.debug(
+ 'Controller._on_connect_success() - %s | %s: Received event on dead object. data=%s',
+ self.id,
+ self.device,
+ data,
+ )
+
+ def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): # pylint: disable=unused-argument
+ '''@brief Function called when we fail to connect to the Controller.'''
+ op_obj.kill()
+ self._connect_op = None
+ if self._alive():
+ if self._connect_attempts == 1:
+ # Do a fast re-try on the first failure.
+ self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC)
+ elif self._connect_attempts == 2:
+ # If the fast connect re-try fails, then we can print a message to
+ # indicate the failure, and start a slow re-try period.
+ self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC)
+ logging.error('%s Failed to connect to controller. %s %s', self.id, err.domain, err.message)
+
+ if self._should_try_to_reconnect():
+ logging.debug(
+ 'Controller._on_connect_fail() - %s %s. Retry in %s sec.',
+ self.id,
+ err,
+ self._retry_connect_tmr.get_timeout(),
+ )
+ self._retry_connect_tmr.start()
+ else:
+ logging.debug(
+ 'Controller._on_connect_fail() - %s Received event on dead object. %s %s',
+ self.id,
+ err.domain,
+ err.message,
+ )
+
+ def disconnect(self, disconnected_cb, keep_connection):
+ '''@brief Issue an asynchronous disconnect command to a Controller.
+ Once the async command has completed, the callback 'disconnected_cb'
+ will be invoked. If a controller is already disconnected, then the
+ callback will be added to the main loop's next idle slot to be executed
+ ASAP.
+
+ @param disconnected_cb: Callback to be called when disconnect has
+ completed. the callback must have this signature:
+ def cback(controller: Controller, success: bool)
+ @param keep_connection: Whether the underlying connection should remain
+ in the kernel.
+ '''
+ logging.debug(
+ 'Controller.disconnect() - %s | %s: keep_connection=%s', self.id, self.device, keep_connection
+ )
+ if self._ctrl and self._ctrl.connected() and not keep_connection:
+ logging.info('%s | %s - Disconnect initiated', self.id, self.device)
+ op = gutil.AsyncTask(self._on_disconn_success, self._on_disconn_fail, self._ctrl.disconnect)
+ op.run_async(disconnected_cb)
+ else:
+ # Defer callback to the next main loop's idle period. The callback
+ # cannot be called directly as the current Controller object is in the
+ # process of being disconnected and the callback will in fact delete
+ # the object. This would invariably lead to unpredictable outcome.
+ GLib.idle_add(disconnected_cb, self, True)
+
+ def _on_disconn_success(self, op_obj: gutil.AsyncTask, data, disconnected_cb): # pylint: disable=unused-argument
+ logging.debug('Controller._on_disconn_success() - %s | %s', self.id, self.device)
+ op_obj.kill()
+ # Defer callback to the next main loop's idle period. The callback
+ # cannot be called directly as the current Controller object is in the
+ # process of being disconnected and the callback will in fact delete
+ # the object. This would invariably lead to unpredictable outcome.
+ GLib.idle_add(disconnected_cb, self, True)
+
+ def _on_disconn_fail(
+ self, op_obj: gutil.AsyncTask, err, fail_cnt, disconnected_cb
+ ): # pylint: disable=unused-argument
+ logging.debug('Controller._on_disconn_fail() - %s | %s: %s', self.id, self.device, err)
+ op_obj.kill()
+ # Defer callback to the next main loop's idle period. The callback
+ # cannot be called directly as the current Controller object is in the
+ # process of being disconnected and the callback will in fact delete
+ # the object. This would invariably lead to unpredictable outcome.
+ GLib.idle_add(disconnected_cb, self, False)
+
+
+# ******************************************************************************
+class Dc(Controller):
+ '''@brief This object establishes a connection to one Discover Controller (DC).
+ It retrieves the discovery log pages and caches them.
+ It also monitors udev events associated with that DC and updates
+ the cached discovery log pages accordingly.
+ '''
+
+ GET_LOG_PAGE_RETRY_RERIOD_SEC = 20
+ REGISTRATION_RETRY_RERIOD_SEC = 5
+ GET_SUPPORTED_RETRY_RERIOD_SEC = 5
+
+ def __init__(self, staf, tid: trid.TID, log_pages=None, origin=None):
+ super().__init__(tid, staf, discovery_ctrl=True)
+ self._register_op = None
+ self._get_supported_op = None
+ self._get_log_op = None
+ self._origin = origin
+ self._log_pages = log_pages if log_pages else list() # Log pages cache
+
+ # For Avahi-discovered DCs that later become unresponsive, monitor how
+ # long the controller remains unresponsive and if it does not return for
+ # a configurable soak period (_ctrl_unresponsive_tmr), remove that
+ # controller. Only Avahi-discovered controllers need this timeout-based
+ # cleanup.
+ self._ctrl_unresponsive_time = None # The time at which connectivity was lost
+ self._ctrl_unresponsive_tmr = gutil.GTimer(0, self._serv.controller_unresponsive, self.tid)
+
+ def _release_resources(self):
+ logging.debug('Dc._release_resources() - %s | %s', self.id, self.device)
+ super()._release_resources()
+
+ if self._ctrl_unresponsive_tmr is not None:
+ self._ctrl_unresponsive_tmr.kill()
+
+ self._log_pages = list()
+ self._ctrl_unresponsive_tmr = None
+
+ def _kill_ops(self):
+ super()._kill_ops()
+ if self._get_log_op:
+ self._get_log_op.kill()
+ self._get_log_op = None
+ if self._register_op:
+ self._register_op.kill()
+ self._register_op = None
+ if self._get_supported_op:
+ self._get_supported_op.kill()
+ self._get_supported_op = None
+
+ def all_ops_completed(self) -> bool:
+ '''@brief Returns True if all operations have completed. False otherwise.'''
+ return (
+ super().all_ops_completed()
+ and (self._get_log_op is None or self._get_log_op.completed())
+ and (self._register_op is None or self._register_op.completed())
+ and (self._get_supported_op is None or self._get_supported_op.completed())
+ )
+
+ @property
+ def origin(self):
+ '''@brief Return how this controller came into existance. Was it
+ "discovered" through mDNS service discovery (TP8009), was it manually
+ "configured" in stafd.conf, or was it a "referral".
+ '''
+ return self._origin
+
+ @origin.setter
+ def origin(self, value):
+ '''@brief Set the origin of this controller.'''
+ if value in ('discovered', 'configured', 'referral'):
+ self._origin = value
+ self._handle_lost_controller()
+ else:
+ logging.error('%s | %s - Trying to set invalid origin to %s', self.id, self.device, value)
+
+ def reload_hdlr(self):
+ '''@brief This is called when a "reload" signal is received.'''
+ logging.debug('Dc.reload_hdlr() - %s | %s', self.id, self.device)
+
+ self._handle_lost_controller()
+ self._resync_with_controller()
+
+ def info(self) -> dict:
+ '''@brief Get the controller info for this object'''
+ timeout = conf.SvcConf().zeroconf_persistence_sec
+ unresponsive_time = (
+ time.asctime(self._ctrl_unresponsive_time) if self._ctrl_unresponsive_time is not None else '---'
+ )
+ info = super().info()
+ info['origin'] = self.origin
+ if self.origin == 'discovered':
+ # The code that handles "unresponsive" DCs only applies to
+ # discovered DCs. So, let's only print that info when it's relevant.
+ info['unresponsive timer'] = str(self._ctrl_unresponsive_tmr)
+ info['unresponsive timeout'] = f'{timeout} sec' if timeout >= 0 else 'forever'
+ info['unresponsive time'] = unresponsive_time
+ if self._get_log_op:
+ info['get log page operation'] = str(self._get_log_op.as_dict())
+ if self._register_op:
+ info['register operation'] = str(self._register_op.as_dict())
+ if self._get_supported_op:
+ info['get supported log page operation'] = str(self._get_supported_op.as_dict())
+ return info
+
+ def cancel(self):
+ '''@brief Used to cancel pending operations.'''
+ super().cancel()
+ if self._get_log_op:
+ self._get_log_op.cancel()
+ if self._register_op:
+ self._register_op.cancel()
+ if self._get_supported_op:
+ self._get_supported_op.cancel()
+
+ def log_pages(self) -> list:
+ '''@brief Get the cached log pages for this object'''
+ return self._log_pages
+
+ def referrals(self) -> list:
+ '''@brief Return the list of referrals'''
+ return [page for page in self._log_pages if page['subtype'] == 'referral']
+
+ def _is_ddc(self):
+ return self._ctrl and self._ctrl.dctype != 'cdc'
+
+ def _on_aen(self, aen: int):
+ if aen == DLP_CHANGED and self._get_log_op:
+ self._get_log_op.run_async()
+
+ def _handle_lost_controller(self):
+ if self.origin == 'discovered': # Only apply to mDNS-discovered DCs
+ if not self._serv.is_avahi_reported(self.tid) and not self.connected():
+ timeout = conf.SvcConf().zeroconf_persistence_sec
+ if timeout >= 0:
+ if self._ctrl_unresponsive_time is None:
+ self._ctrl_unresponsive_time = time.localtime()
+ self._ctrl_unresponsive_tmr.start(timeout)
+ logging.info(
+ '%s | %s - Controller is not responding. Will be removed by %s unless restored',
+ self.id,
+ self.device,
+ time.ctime(time.mktime(self._ctrl_unresponsive_time) + timeout),
+ )
+
+ return
+
+ logging.info(
+ '%s | %s - Controller not responding. Retrying...',
+ self.id,
+ self.device,
+ )
+
+ self._ctrl_unresponsive_time = None
+ self._ctrl_unresponsive_tmr.stop()
+ self._ctrl_unresponsive_tmr.set_timeout(0)
+
+ def is_unresponsive(self):
+ '''@brief For "discovered" DC, return True if DC is unresponsive,
+ False otherwise.
+ '''
+ return (
+ self.origin == 'discovered'
+ and not self._serv.is_avahi_reported(self.tid)
+ and not self.connected()
+ and self._ctrl_unresponsive_time is not None
+ and self._ctrl_unresponsive_tmr.time_remaining() <= 0
+ )
+
+ def _resync_with_controller(self):
+ '''Communicate with DC to resync the states'''
+ if self._register_op:
+ self._register_op.run_async()
+ elif self._get_supported_op:
+ self._get_supported_op.run_async()
+ elif self._get_log_op:
+ self._get_log_op.run_async()
+
+ def _on_nvme_event(self, nvme_event: str):
+ if nvme_event in ('connected', 'rediscover'):
+ # This event indicates that the kernel
+ # driver re-connected to the DC.
+ logging.debug(
+ 'Dc._on_nvme_event() - %s | %s: Received "%s" event',
+ self.id,
+ self.device,
+ nvme_event,
+ )
+ self._resync_with_controller()
+
+ def _find_existing_connection(self):
+ return self._udev.find_nvme_dc_device(self.tid)
+
+ def _post_registration_actions(self):
+ # Need to check that supported_log_pages() is available (introduced in libnvme 1.2)
+ has_supported_log_pages = hasattr(self._ctrl, 'supported_log_pages')
+ if not has_supported_log_pages:
+ logging.warning(
+ '%s | %s - libnvme-%s does not support "Get supported log pages". Please upgrade libnvme.',
+ self.id,
+ self.device,
+ defs.LIBNVME_VERSION,
+ )
+
+ if conf.SvcConf().pleo_enabled and self._is_ddc() and has_supported_log_pages:
+ self._get_supported_op = gutil.AsyncTask(
+ self._on_get_supported_success, self._on_get_supported_fail, self._ctrl.supported_log_pages
+ )
+ self._get_supported_op.run_async()
+ else:
+ self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
+ self._get_log_op.run_async()
+
+ # --------------------------------------------------------------------------
+ def _on_connect_success(self, op_obj: gutil.AsyncTask, data):
+ '''@brief Function called when we successfully connect to the
+ Discovery Controller.
+ '''
+ super()._on_connect_success(op_obj, data)
+
+ if self._alive():
+ self._ctrl_unresponsive_time = None
+ self._ctrl_unresponsive_tmr.stop()
+ self._ctrl_unresponsive_tmr.set_timeout(0)
+
+ if self._ctrl.is_registration_supported():
+ self._register_op = gutil.AsyncTask(
+ self._on_registration_success,
+ self._on_registration_fail,
+ self._ctrl.registration_ctlr,
+ nvme.NVMF_DIM_TAS_REGISTER,
+ )
+ self._register_op.run_async()
+ else:
+ self._post_registration_actions()
+
+ def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
+ '''@brief Function called when we fail to connect to the Controller.'''
+ super()._on_connect_fail(op_obj, err, fail_cnt)
+
+ if self._alive():
+ self._handle_lost_controller()
+
+ # --------------------------------------------------------------------------
+ def _on_registration_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
+ '''@brief Function called when we successfully register with the
+ Discovery Controller. See self._register_op object
+ for details.
+
+ NOTE: The name _on_registration_success() may be misleading. "success"
+ refers to the fact that a successful exchange was made with the DC.
+ It doesn't mean that the registration itself succeeded.
+ '''
+ if self._alive():
+ if data is not None:
+ logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data)
+ else:
+ logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device)
+
+ self._post_registration_actions()
+ else:
+ logging.debug(
+ 'Dc._on_registration_success() - %s | %s: Received event on dead object.', self.id, self.device
+ )
+
+ def _on_registration_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
+ '''@brief Function called when we fail to register with the
+ Discovery Controller. See self._register_op object
+ for details.
+ '''
+ if self._alive():
+ logging.debug(
+ 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec',
+ self.id,
+ self.device,
+ err,
+ Dc.REGISTRATION_RETRY_RERIOD_SEC,
+ )
+ if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
+ logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err)
+ op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC)
+ else:
+ logging.debug(
+ 'Dc._on_registration_fail() - %s | %s: Received event on dead object. %s',
+ self.id,
+ self.device,
+ err,
+ )
+ op_obj.kill()
+
+ # --------------------------------------------------------------------------
+ def _on_get_supported_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
+ '''@brief Function called when we successfully retrieved the supported
+ log pages from the Discovery Controller. See self._get_supported_op object
+ for details.
+
+ NOTE: The name _on_get_supported_success() may be misleading. "success"
+ refers to the fact that a successful exchange was made with the DC.
+ It doesn't mean that the Get Supported Log Page itself succeeded.
+ '''
+ if self._alive():
+ try:
+ dlp_supp_opts = data[nvme.NVME_LOG_LID_DISCOVER] >> 16
+ except (TypeError, IndexError):
+ dlp_supp_opts = 0
+
+ logging.debug(
+ 'Dc._on_get_supported_success() - %s | %s: supported options = 0x%04X = %s',
+ self.id,
+ self.device,
+ dlp_supp_opts,
+ dlp_supp_opts_as_string(dlp_supp_opts),
+ )
+
+ if 'lsp' in inspect.signature(self._ctrl.discover).parameters:
+ lsp = nvme.NVMF_LOG_DISC_LSP_PLEO if dlp_supp_opts & nvme.NVMF_LOG_DISC_LID_PLEOS else 0
+ self._get_log_op = gutil.AsyncTask(
+ self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover, lsp
+ )
+ else:
+ logging.warning(
+ '%s | %s - libnvme-%s does not support setting PLEO bit. Please upgrade.',
+ self.id,
+ self.device,
+ defs.LIBNVME_VERSION,
+ )
+ self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
+ self._get_log_op.run_async()
+ else:
+ logging.debug(
+ 'Dc._on_get_supported_success() - %s | %s: Received event on dead object.', self.id, self.device
+ )
+
+ def _on_get_supported_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
+ '''@brief Function called when we fail to retrieve the supported log
+ page from the Discovery Controller. See self._get_supported_op object
+ for details.
+ '''
+ if self._alive():
+ logging.debug(
+ 'Dc._on_get_supported_fail() - %s | %s: %s. Retry in %s sec',
+ self.id,
+ self.device,
+ err,
+ Dc.GET_SUPPORTED_RETRY_RERIOD_SEC,
+ )
+ if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
+ logging.error(
+ '%s | %s - Failed to Get supported log pages from Discovery Controller. %s',
+ self.id,
+ self.device,
+ err,
+ )
+ op_obj.retry(Dc.GET_SUPPORTED_RETRY_RERIOD_SEC)
+ else:
+ logging.debug(
+ 'Dc._on_get_supported_fail() - %s | %s: Received event on dead object. %s',
+ self.id,
+ self.device,
+ err,
+ )
+ op_obj.kill()
+
+ # --------------------------------------------------------------------------
+ def _on_get_log_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
+ '''@brief Function called when we successfully retrieve the log pages
+ from the Discovery Controller. See self._get_log_op object
+ for details.
+ '''
+ if self._alive():
+ # Note that for historical reasons too long to explain, the CDC may
+ # return invalid addresses ('0.0.0.0', '::', or ''). Those need to
+ # be filtered out.
+ referrals_before = self.referrals()
+ self._log_pages = (
+ [
+ {k.strip(): str(v).strip() for k, v in dictionary.items()}
+ for dictionary in data
+ if dictionary.get('traddr', '').strip() not in ('0.0.0.0', '::', '')
+ ]
+ if data
+ else list()
+ )
+ logging.info(
+ '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages)
+ )
+ referrals_after = self.referrals()
+ self._serv.log_pages_changed(self, self.device)
+ if referrals_after != referrals_before:
+ logging.debug(
+ 'Dc._on_get_log_success() - %s | %s: Referrals before = %s',
+ self.id,
+ self.device,
+ referrals_before,
+ )
+ logging.debug(
+ 'Dc._on_get_log_success() - %s | %s: Referrals after = %s',
+ self.id,
+ self.device,
+ referrals_after,
+ )
+ self._serv.referrals_changed()
+ else:
+ logging.debug(
+ 'Dc._on_get_log_success() - %s | %s: Received event on dead object.', self.id, self.device
+ )
+
+ def _on_get_log_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
+ '''@brief Function called when we fail to retrieve the log pages
+ from the Discovery Controller. See self._get_log_op object
+ for details.
+ '''
+ if self._alive():
+ logging.debug(
+ 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec',
+ self.id,
+ self.device,
+ err,
+ Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC,
+ )
+ if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
+ logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err)
+ op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC)
+ else:
+ logging.debug(
+ 'Dc._on_get_log_fail() - %s | %s: Received event on dead object. %s',
+ self.id,
+ self.device,
+ err,
+ )
+ op_obj.kill()
+
+
+# ******************************************************************************
+class Ioc(Controller):
+ '''@brief This object establishes a connection to one I/O Controller.'''
+
+ def __init__(self, stac, tid: trid.TID):
+ self._dlpe = None
+ super().__init__(tid, stac)
+
+ def _find_existing_connection(self):
+ return self._udev.find_nvme_ioc_device(self.tid)
+
+ def _on_aen(self, aen: int):
+ pass
+
+ def _on_nvme_event(self, nvme_event):
+ pass
+
+ def reload_hdlr(self):
+ '''@brief This is called when a "reload" signal is received.'''
+ if not self.connected() and self._retry_connect_tmr.time_remaining() == 0:
+ self._try_to_connect_deferred.schedule()
+
+ @property
+ def eflags(self):
+ '''@brief Return the eflag field of the DLPE'''
+ return get_eflags(self._dlpe)
+
+ @property
+ def ncc(self):
+ '''@brief Return Not Connected to CDC status'''
+ return get_ncc(self.eflags)
+
+ def details(self) -> dict:
+ '''@brief return detailed debug info about this controller'''
+ details = super().details()
+ details['dlpe'] = str(self._dlpe)
+ details['dlpe.eflags.ncc'] = str(self.ncc)
+ return details
+
+ def update_dlpe(self, dlpe):
+ '''@brief This method is called when a new DLPE associated
+ with this controller is received.'''
+ new_ncc = get_ncc(get_eflags(dlpe))
+ old_ncc = self.ncc
+ self._dlpe = dlpe
+
+ if old_ncc and not new_ncc: # NCC bit cleared?
+ if not self.connected():
+ self._connect_attempts = 0
+ self._try_to_connect_deferred.schedule()
+
+ def _should_try_to_reconnect(self):
+ '''@brief This is used to determine when it's time to stop trying toi connect'''
+ max_connect_attempts = conf.SvcConf().connect_attempts_on_ncc if self.ncc else 0
+ return max_connect_attempts == 0 or self._connect_attempts < max_connect_attempts