diff options
Diffstat (limited to 'staslib/ctrl.py')
-rw-r--r-- | staslib/ctrl.py | 360 |
1 files changed, 184 insertions, 176 deletions
diff --git a/staslib/ctrl.py b/staslib/ctrl.py index 97a1c7b..ad221e0 100644 --- a/staslib/ctrl.py +++ b/staslib/ctrl.py @@ -135,26 +135,7 @@ class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attri self._root.log_level("debug" if tron else "err") def _on_udev_notification(self, udev_obj): - if self._alive(): - if udev_obj.action == 'change': - nvme_aen = udev_obj.get('NVME_AEN') - nvme_event = udev_obj.get('NVME_EVENT') - if isinstance(nvme_aen, str): - logging.info('%s | %s - Received AEN: %s', self.id, udev_obj.sys_name, nvme_aen) - self._on_aen(int(nvme_aen, 16)) - if isinstance(nvme_event, str): - self._on_nvme_event(nvme_event) - elif udev_obj.action == 'remove': - logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name) - self._on_ctrl_removed(udev_obj) - else: - logging.debug( - 'Controller._on_udev_notification() - %s | %s: Received "%s" event', - self.id, - udev_obj.sys_name, - udev_obj.action, - ) - else: + if not self._alive(): logging.debug( 'Controller._on_udev_notification() - %s | %s: Received event on dead object. udev_obj %s: %s', self.id, @@ -162,6 +143,26 @@ class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attri udev_obj.action, udev_obj.sys_name, ) + return + + if udev_obj.action == 'change': + nvme_aen = udev_obj.get('NVME_AEN') + nvme_event = udev_obj.get('NVME_EVENT') + if isinstance(nvme_aen, str): + logging.info('%s | %s - Received AEN: %s', self.id, udev_obj.sys_name, nvme_aen) + self._on_aen(int(nvme_aen, 16)) + if isinstance(nvme_event, str): + self._on_nvme_event(nvme_event) + elif udev_obj.action == 'remove': + logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name) + self._on_ctrl_removed(udev_obj) + else: + logging.debug( + 'Controller._on_udev_notification() - %s | %s: Received "%s" event', + self.id, + udev_obj.sys_name, + udev_obj.action, + ) def _on_ctrl_removed(self, udev_obj): # pylint: disable=unused-argument if self._udev: @@ -269,48 +270,51 @@ class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attri op_obj.kill() self._connect_op = None - if self._alive(): - self._device = self._ctrl.name - logging.info('%s | %s - Connection established!', self.id, self.device) - self._connect_attempts = 0 - self._udev.register_for_device_events(self._device, self._on_udev_notification) - else: + if not self._alive(): logging.debug( 'Controller._on_connect_success() - %s | %s: Received event on dead object. data=%s', self.id, self.device, data, ) + return + + self._device = self._ctrl.name + logging.info('%s | %s - Connection established!', self.id, self.device) + self._connect_attempts = 0 + self._udev.register_for_device_events(self._device, self._on_udev_notification) def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): # pylint: disable=unused-argument '''@brief Function called when we fail to connect to the Controller.''' op_obj.kill() self._connect_op = None - if self._alive(): - if self._connect_attempts == 1: - # Do a fast re-try on the first failure. - self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC) - elif self._connect_attempts == 2: - # If the fast connect re-try fails, then we can print a message to - # indicate the failure, and start a slow re-try period. - self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC) - logging.error('%s Failed to connect to controller. %s %s', self.id, err.domain, err.message) - - if self._should_try_to_reconnect(): - logging.debug( - 'Controller._on_connect_fail() - %s %s. Retry in %s sec.', - self.id, - err, - self._retry_connect_tmr.get_timeout(), - ) - self._retry_connect_tmr.start() - else: + + if not self._alive(): logging.debug( 'Controller._on_connect_fail() - %s Received event on dead object. %s %s', self.id, err.domain, err.message, ) + return + + if self._connect_attempts == 1: + # Do a fast re-try on the first failure. + self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC) + elif self._connect_attempts == 2: + # If the fast connect re-try fails, then we can print a message to + # indicate the failure, and start a slow re-try period. + self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC) + logging.error('%s Failed to connect to controller. %s %s', self.id, err.domain, err.message) + + if self._should_try_to_reconnect(): + logging.debug( + 'Controller._on_connect_fail() - %s %s. Retry in %s sec.', + self.id, + err, + self._retry_connect_tmr.get_timeout(), + ) + self._retry_connect_tmr.start() def disconnect(self, disconnected_cb, keep_connection): '''@brief Issue an asynchronous disconnect command to a Controller. @@ -507,11 +511,7 @@ class Dc(Controller): return - logging.info( - '%s | %s - Controller not responding. Retrying...', - self.id, - self.device, - ) + logging.info('%s | %s - Controller not responding. Retrying...', self.id, self.device) self._ctrl_unresponsive_time = None self._ctrl_unresponsive_tmr.stop() @@ -555,8 +555,8 @@ class Dc(Controller): def _post_registration_actions(self): # Need to check that supported_log_pages() is available (introduced in libnvme 1.2) - has_supported_log_pages = hasattr(self._ctrl, 'supported_log_pages') - if not has_supported_log_pages: + get_slp = getattr(self._ctrl, 'supported_log_pages', None) + if get_slp is None: logging.warning( '%s | %s - libnvme-%s does not support "Get supported log pages". Please upgrade libnvme.', self.id, @@ -564,9 +564,9 @@ class Dc(Controller): defs.LIBNVME_VERSION, ) - if conf.SvcConf().pleo_enabled and self._is_ddc() and has_supported_log_pages: + if conf.SvcConf().pleo_enabled and self._is_ddc() and get_slp is not None: self._get_supported_op = gutil.AsyncTask( - self._on_get_supported_success, self._on_get_supported_fail, self._ctrl.supported_log_pages + self._on_get_supported_success, self._on_get_supported_fail, get_slp ) self._get_supported_op.run_async() else: @@ -580,21 +580,23 @@ class Dc(Controller): ''' super()._on_connect_success(op_obj, data) - if self._alive(): - self._ctrl_unresponsive_time = None - self._ctrl_unresponsive_tmr.stop() - self._ctrl_unresponsive_tmr.set_timeout(0) - - if self._ctrl.is_registration_supported(): - self._register_op = gutil.AsyncTask( - self._on_registration_success, - self._on_registration_fail, - self._ctrl.registration_ctlr, - nvme.NVMF_DIM_TAS_REGISTER, - ) - self._register_op.run_async() - else: - self._post_registration_actions() + if not self._alive(): + return + + self._ctrl_unresponsive_time = None + self._ctrl_unresponsive_tmr.stop() + self._ctrl_unresponsive_tmr.set_timeout(0) + + if self._ctrl.is_registration_supported(): + self._register_op = gutil.AsyncTask( + self._on_registration_success, + self._on_registration_fail, + self._ctrl.registration_ctlr, + nvme.NVMF_DIM_TAS_REGISTER, + ) + self._register_op.run_async() + else: + self._post_registration_actions() def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): '''@brief Function called when we fail to connect to the Controller.''' @@ -613,35 +615,25 @@ class Dc(Controller): refers to the fact that a successful exchange was made with the DC. It doesn't mean that the registration itself succeeded. ''' - if self._alive(): - if data is not None: - logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data) - else: - logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) - - self._post_registration_actions() - else: + if not self._alive(): logging.debug( 'Dc._on_registration_success() - %s | %s: Received event on dead object.', self.id, self.device ) + return + + if data is not None: + logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data) + else: + logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) + + self._post_registration_actions() def _on_registration_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): '''@brief Function called when we fail to register with the Discovery Controller. See self._register_op object for details. ''' - if self._alive(): - logging.debug( - 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec', - self.id, - self.device, - err, - Dc.REGISTRATION_RETRY_RERIOD_SEC, - ) - if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails - logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) - op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) - else: + if not self._alive(): logging.debug( 'Dc._on_registration_fail() - %s | %s: Received event on dead object. %s', self.id, @@ -649,6 +641,18 @@ class Dc(Controller): err, ) op_obj.kill() + return + + logging.debug( + 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec', + self.id, + self.device, + err, + Dc.REGISTRATION_RETRY_RERIOD_SEC, + ) + if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails + logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) + op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) # -------------------------------------------------------------------------- def _on_get_supported_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument @@ -660,68 +664,70 @@ class Dc(Controller): refers to the fact that a successful exchange was made with the DC. It doesn't mean that the Get Supported Log Page itself succeeded. ''' - if self._alive(): - try: - dlp_supp_opts = data[nvme.NVME_LOG_LID_DISCOVER] >> 16 - except (TypeError, IndexError): - dlp_supp_opts = 0 - + if not self._alive(): logging.debug( - 'Dc._on_get_supported_success() - %s | %s: supported options = 0x%04X = %s', - self.id, - self.device, - dlp_supp_opts, - dlp_supp_opts_as_string(dlp_supp_opts), + 'Dc._on_get_supported_success() - %s | %s: Received event on dead object.', self.id, self.device ) + return - if 'lsp' in inspect.signature(self._ctrl.discover).parameters: - lsp = nvme.NVMF_LOG_DISC_LSP_PLEO if dlp_supp_opts & nvme.NVMF_LOG_DISC_LID_PLEOS else 0 - self._get_log_op = gutil.AsyncTask( - self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover, lsp - ) - else: - logging.warning( - '%s | %s - libnvme-%s does not support setting PLEO bit. Please upgrade.', - self.id, - self.device, - defs.LIBNVME_VERSION, - ) - self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover) - self._get_log_op.run_async() + try: + dlp_supp_opts = data[nvme.NVME_LOG_LID_DISCOVER] >> 16 + except (TypeError, IndexError): + dlp_supp_opts = 0 + + logging.debug( + 'Dc._on_get_supported_success() - %s | %s: supported options = 0x%04X = %s', + self.id, + self.device, + dlp_supp_opts, + dlp_supp_opts_as_string(dlp_supp_opts), + ) + + if 'lsp' in inspect.signature(self._ctrl.discover).parameters: + lsp = nvme.NVMF_LOG_DISC_LSP_PLEO if dlp_supp_opts & nvme.NVMF_LOG_DISC_LID_PLEOS else 0 + self._get_log_op = gutil.AsyncTask( + self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover, lsp + ) else: - logging.debug( - 'Dc._on_get_supported_success() - %s | %s: Received event on dead object.', self.id, self.device + logging.warning( + '%s | %s - libnvme-%s does not support setting PLEO bit. Please upgrade.', + self.id, + self.device, + defs.LIBNVME_VERSION, ) + self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover) + self._get_log_op.run_async() def _on_get_supported_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): '''@brief Function called when we fail to retrieve the supported log page from the Discovery Controller. See self._get_supported_op object for details. ''' - if self._alive(): + if not self._alive(): logging.debug( - 'Dc._on_get_supported_fail() - %s | %s: %s. Retry in %s sec', + 'Dc._on_get_supported_fail() - %s | %s: Received event on dead object. %s', self.id, self.device, err, - Dc.GET_SUPPORTED_RETRY_RERIOD_SEC, ) - if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails - logging.error( - '%s | %s - Failed to Get supported log pages from Discovery Controller. %s', - self.id, - self.device, - err, - ) - op_obj.retry(Dc.GET_SUPPORTED_RETRY_RERIOD_SEC) - else: - logging.debug( - 'Dc._on_get_supported_fail() - %s | %s: Received event on dead object. %s', + op_obj.kill() + return + + logging.debug( + 'Dc._on_get_supported_fail() - %s | %s: %s. Retry in %s sec', + self.id, + self.device, + err, + Dc.GET_SUPPORTED_RETRY_RERIOD_SEC, + ) + if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails + logging.error( + '%s | %s - Failed to Get supported log pages from Discovery Controller. %s', self.id, self.device, err, ) - op_obj.kill() + op_obj.retry(Dc.GET_SUPPORTED_RETRY_RERIOD_SEC) # -------------------------------------------------------------------------- def _on_get_log_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument @@ -729,61 +735,51 @@ class Dc(Controller): from the Discovery Controller. See self._get_log_op object for details. ''' - if self._alive(): - # Note that for historical reasons too long to explain, the CDC may - # return invalid addresses ('0.0.0.0', '::', or ''). Those need to - # be filtered out. - referrals_before = self.referrals() - self._log_pages = ( - [ - {k.strip(): str(v).strip() for k, v in dictionary.items()} - for dictionary in data - if dictionary.get('traddr', '').strip() not in ('0.0.0.0', '::', '') - ] - if data - else list() + if not self._alive(): + logging.debug( + 'Dc._on_get_log_success() - %s | %s: Received event on dead object.', self.id, self.device ) - logging.info( - '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) + return + + # Note that for historical reasons too long to explain, the CDC may + # return invalid addresses ('0.0.0.0', '::', or ''). Those need to + # be filtered out. + referrals_before = self.referrals() + self._log_pages = ( + [ + {k.strip(): str(v).strip() for k, v in dictionary.items()} + for dictionary in data + if dictionary.get('traddr', '').strip() not in ('0.0.0.0', '::', '') + ] + if data + else list() + ) + logging.info( + '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) + ) + referrals_after = self.referrals() + self._serv.log_pages_changed(self, self.device) + if referrals_after != referrals_before: + logging.debug( + 'Dc._on_get_log_success() - %s | %s: Referrals before = %s', + self.id, + self.device, + referrals_before, ) - referrals_after = self.referrals() - self._serv.log_pages_changed(self, self.device) - if referrals_after != referrals_before: - logging.debug( - 'Dc._on_get_log_success() - %s | %s: Referrals before = %s', - self.id, - self.device, - referrals_before, - ) - logging.debug( - 'Dc._on_get_log_success() - %s | %s: Referrals after = %s', - self.id, - self.device, - referrals_after, - ) - self._serv.referrals_changed() - else: logging.debug( - 'Dc._on_get_log_success() - %s | %s: Received event on dead object.', self.id, self.device + 'Dc._on_get_log_success() - %s | %s: Referrals after = %s', + self.id, + self.device, + referrals_after, ) + self._serv.referrals_changed() def _on_get_log_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): '''@brief Function called when we fail to retrieve the log pages from the Discovery Controller. See self._get_log_op object for details. ''' - if self._alive(): - logging.debug( - 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', - self.id, - self.device, - err, - Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, - ) - if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails - logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) - op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) - else: + if not self._alive(): logging.debug( 'Dc._on_get_log_fail() - %s | %s: Received event on dead object. %s', self.id, @@ -791,6 +787,18 @@ class Dc(Controller): err, ) op_obj.kill() + return + + logging.debug( + 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', + self.id, + self.device, + err, + Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, + ) + if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails + logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) + op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) # ****************************************************************************** |