1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
|
# Copyright (c) 2022, Dell Inc. or its subsidiaries. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# See the LICENSE file for details.
#
# This file is part of NVMe STorage Appliance Services (nvme-stas).
#
# Authors: Martin Belanger <Martin.Belanger@dell.com>
#
'''This module defines the base Controller object from which the
Dc (Discovery Controller) and Ioc (I/O Controller) objects are derived.'''
import time
import inspect
import logging
from gi.repository import GLib
from libnvme import nvme
from staslib import conf, defs, gutil, trid, udev, stas
# AEN value compared against incoming AENs to detect a "discovery log page
# changed" notice. It is assembled from three libnvme constants:
#   NVME_LOG_LID_DISCOVER         shifted into bits 16 and up
#   NVME_AER_NOTICE_DISC_CHANGED  shifted into bits 8 and up
#   NVME_AER_NOTICE               in the low bits
DLP_CHANGED = (
    nvme.NVME_AER_NOTICE
    | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8)
    | (nvme.NVME_LOG_LID_DISCOVER << 16)
)  # 0x70f002
def get_eflags(dlpe):
    '''@brief Return the 'eflags' field of a DLPE as an integer.

    @param dlpe: Mapping holding the DLPE fields. May be None or empty.
    @return int(dlpe['eflags']) when the key is present; 0 when @dlpe is
            falsy or has no 'eflags' key.
    '''
    if not dlpe:
        return 0

    raw_eflags = dlpe.get('eflags', 0)
    return int(raw_eflags)
def get_ncc(eflags: int):
    '''@brief Tell whether the "Not Connected to CDC" (NCC) bit is asserted.

    @param eflags: Integer eflags value (e.g. as returned by get_eflags()).
    @return True if the nvme.NVMF_DISC_EFLAGS_NCC bit is set in @eflags,
            False otherwise.
    '''
    ncc_bit = eflags & nvme.NVMF_DISC_EFLAGS_NCC
    return ncc_bit != 0
def dlp_supp_opts_as_string(dlp_supp_opts: int):
    '''@brief Return the list of options supported by the Get
    discovery log page command.

    @param dlp_supp_opts: Bit field of supported options.
    @return List with the name of every option whose mask bit is set in
            @dlp_supp_opts (empty list when none is set).
    '''
    names_by_mask = {
        nvme.NVMF_LOG_DISC_LID_EXTDLPES: "EXTDLPES",
        nvme.NVMF_LOG_DISC_LID_PLEOS: "PLEOS",
        nvme.NVMF_LOG_DISC_LID_ALLSUBES: "ALLSUBES",
    }

    supported = []
    for mask, name in names_by_mask.items():
        if dlp_supp_opts & mask:
            supported.append(name)
    return supported
# ******************************************************************************
class Controller(stas.ControllerABC):  # pylint: disable=too-many-instance-attributes
    '''@brief Base class used to manage the connection to a controller.

    It owns the libnvme root/host/ctrl objects for one transport ID (TID),
    runs the asynchronous connect and disconnect operations, and relays
    udev events to the hooks _on_aen(), _on_nvme_event() and
    _find_existing_connection() that the derived classes provide.
    '''

    def __init__(self, tid: trid.TID, service, discovery_ctrl: bool = False):
        '''@brief Create the libnvme root and host objects for this controller.

        @param tid: Transport ID of the controller to connect to.
        @param service: Service object, forwarded to the parent class.
        @param discovery_ctrl: Forwarded to the parent class.
        '''
        sysconf = conf.SysConf()
        self._nvme_options = conf.NvmeOptions()
        self._root = nvme.root()
        self._host = nvme.host(
            self._root, hostnqn=sysconf.hostnqn, hostid=sysconf.hostid, hostsymname=sysconf.hostsymname
        )
        # Only hand the host key over when the NVMe options report support for it.
        self._host.dhchap_key = sysconf.hostkey if self._nvme_options.dhchap_hostkey_supp else None
        self._udev = udev.UDEV
        self._device = None  # Refers to the nvme device (e.g. /dev/nvme[n])
        self._ctrl = None  # libnvme's nvme.ctrl object
        self._connect_op = None  # Pending asynchronous connect operation (if any)

        super().__init__(tid, service, discovery_ctrl)

    def _release_resources(self):
        '''Stop listening to udev, kill pending operations and drop every
        reference held by this object.'''
        logging.debug('Controller._release_resources() - %s | %s', self.id, self.device)

        if self._udev:
            self._udev.unregister_for_device_events(self._on_udev_notification)

        self._kill_ops()

        super()._release_resources()

        self._ctrl = None
        self._udev = None
        self._host = None
        self._root = None
        self._nvme_options = None

    @property
    def device(self) -> str:
        '''@brief return the Linux nvme device id (e.g. nvme3) or the
        placeholder 'nvme?' if no device is associated with this controller'''
        if not self._device and self._ctrl and self._ctrl.name:
            # Cache the name so that it remains available later on.
            self._device = self._ctrl.name

        return self._device or 'nvme?'

    def all_ops_completed(self) -> bool:
        '''@brief Returns True if all operations have completed. False otherwise.'''
        return self._connect_op is None or self._connect_op.completed()

    def connected(self) -> bool:
        '''@brief Return whether a connection is established

        Always returns a real bool (False when there is no nvme.ctrl
        object) so that the value can safely be printed or compared.
        '''
        return bool(self._ctrl and self._ctrl.connected())

    def controller_id_dict(self) -> dict:
        '''@brief return the controller ID as a dict.'''
        cid = super().controller_id_dict()
        cid['device'] = self.device
        return cid

    def details(self) -> dict:
        '''@brief return detailed debug info about this controller

        The udev attributes are only added while the udev object is still
        held (i.e. before _release_resources() dropped it).
        '''
        details = super().details()
        if self._udev:
            details.update(
                self._udev.get_attributes(self.device, ('hostid', 'hostnqn', 'model', 'serial', 'dctype', 'cntrltype'))
            )
        details['connected'] = str(self.connected())
        return details

    def info(self) -> dict:
        '''@brief Get the controller info for this object'''
        info = super().info()
        if self._connect_op:
            info['connect operation'] = str(self._connect_op.as_dict())
        return info

    def cancel(self):
        '''@brief Used to cancel pending operations.'''
        super().cancel()
        if self._connect_op:
            self._connect_op.cancel()

    def _kill_ops(self):
        '''Kill the pending connect operation (if any) and forget it.'''
        if self._connect_op:
            self._connect_op.kill()
            self._connect_op = None

    def set_level_from_tron(self, tron):
        '''Set log level based on TRON'''
        if self._root:
            self._root.log_level("debug" if tron else "err")

    def _on_udev_notification(self, udev_obj):
        '''Dispatch a udev event: 'change' events carrying NVME_AEN and/or
        NVME_EVENT properties go to _on_aen() / _on_nvme_event(), 'remove'
        events go to _on_ctrl_removed(). Anything else is only logged.'''
        if not self._alive():
            logging.debug(
                'Controller._on_udev_notification() - %s | %s: Received event on dead object. udev_obj %s: %s',
                self.id,
                self.device,
                udev_obj.action,
                udev_obj.sys_name,
            )
            return

        if udev_obj.action == 'change':
            nvme_aen = udev_obj.get('NVME_AEN')
            nvme_event = udev_obj.get('NVME_EVENT')
            if isinstance(nvme_aen, str):
                logging.info('%s | %s - Received AEN: %s', self.id, udev_obj.sys_name, nvme_aen)
                self._on_aen(int(nvme_aen, 16))  # The AEN is parsed as a hexadecimal string
            if isinstance(nvme_event, str):
                self._on_nvme_event(nvme_event)
        elif udev_obj.action == 'remove':
            logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name)
            self._on_ctrl_removed(udev_obj)
        else:
            logging.debug(
                'Controller._on_udev_notification() - %s | %s: Received "%s" event',
                self.id,
                udev_obj.sys_name,
                udev_obj.action,
            )

    def _on_ctrl_removed(self, udev_obj):  # pylint: disable=unused-argument
        '''Handle the removal of the nvme device associated with this object.'''
        if self._udev:
            self._udev.unregister_for_device_events(self._on_udev_notification)
        self._kill_ops()  # Kill all pending operations
        self._ctrl = None

        # Defer removal of this object to the next main loop's idle period.
        GLib.idle_add(self._serv.remove_controller, self, True)

    def _get_cfg(self):
        '''Get configuration parameters. These may either come from the [Global]
        section or from a "controller" entry in the configuration file. A
        definition found in a "controller" entry overrides the same definition
        found in the [Global] section.

        @return dict mapping the connect keyword (e.g. 'keep_alive_tmo') to
                its configured value. Options defined nowhere are left out.
        '''
        cfg = {}
        service_conf = conf.SvcConf()
        for option, keyword in (
            ('kato', 'keep_alive_tmo'),
            ('queue-size', 'queue_size'),
            ('hdr-digest', 'hdr_digest'),
            ('data-digest', 'data_digest'),
            ('nr-io-queues', 'nr_io_queues'),
            ('ctrl-loss-tmo', 'ctrl_loss_tmo'),
            ('disable-sqflow', 'disable_sqflow'),
            ('nr-poll-queues', 'nr_poll_queues'),
            ('nr-write-queues', 'nr_write_queues'),
            ('reconnect-delay', 'reconnect_delay'),
        ):
            # Check if the value is defined as a "controller" entry (i.e. override)
            ovrd_val = self.tid.cfg.get(option, None)
            if ovrd_val is not None:
                cfg[keyword] = ovrd_val
            else:
                # Check if the value is found in the [Global] section.
                glob_val = service_conf.get_option('Global', option)
                if glob_val is not None:
                    cfg[keyword] = glob_val

        return cfg

    def _do_connect(self):
        '''Create the nvme.ctrl object and start the asynchronous connect
        operation. When a matching nvme device already exists, it is reused
        (nvme.ctrl.init) instead of creating a new connection
        (nvme.ctrl.connect).'''
        service_conf = conf.SvcConf()
        host_iface = (
            self.tid.host_iface
            if (self.tid.host_iface and not service_conf.ignore_iface and self._nvme_options.host_iface_supp)
            else None
        )
        self._ctrl = nvme.ctrl(
            self._root,
            subsysnqn=self.tid.subsysnqn,
            transport=self.tid.transport,
            traddr=self.tid.traddr,
            trsvcid=self.tid.trsvcid if self.tid.trsvcid else None,
            host_traddr=self.tid.host_traddr if self.tid.host_traddr else None,
            host_iface=host_iface,
        )
        self._ctrl.discovery_ctrl_set(self._discovery_ctrl)

        # Set the DHCHAP key on the controller
        # NOTE that this will eventually have to
        # change once we have support for AVE (TP8019)
        ctrl_dhchap_key = self.tid.cfg.get('dhchap-ctrl-secret')
        if ctrl_dhchap_key and self._nvme_options.dhchap_ctrlkey_supp:
            has_dhchap_key = hasattr(self._ctrl, 'dhchap_key')
            if not has_dhchap_key:
                logging.warning(
                    '%s | %s - libnvme-%s does not allow setting the controller DHCHAP key. Please upgrade libnvme.',
                    self.id,
                    self.device,
                    defs.LIBNVME_VERSION,
                )
            else:
                self._ctrl.dhchap_key = ctrl_dhchap_key

        # Audit existing nvme devices. If we find a match, then
        # we'll just borrow that device instead of creating a new one.
        udev_obj = self._find_existing_connection()
        if udev_obj is not None:
            # A device already exists.
            self._device = udev_obj.sys_name
            logging.debug(
                'Controller._do_connect() - %s Found existing control device: %s', self.id, udev_obj.sys_name
            )
            self._connect_op = gutil.AsyncTask(
                self._on_connect_success, self._on_connect_fail, self._ctrl.init, self._host, int(udev_obj.sys_number)
            )
        else:
            cfg = self._get_cfg()
            logging.debug(
                'Controller._do_connect() - %s Connecting to nvme control with cfg=%s', self.id, cfg
            )
            self._connect_op = gutil.AsyncTask(
                self._on_connect_success, self._on_connect_fail, self._ctrl.connect, self._host, cfg
            )

        self._connect_op.run_async()

    # --------------------------------------------------------------------------
    def _on_connect_success(self, op_obj: gutil.AsyncTask, data):
        '''@brief Function called when we successfully connect to the
        Controller.

        @param op_obj: The connect operation that just completed.
        @param data: Result of the operation (only used for logging).
        '''
        op_obj.kill()
        self._connect_op = None

        if self._alive():
            self._device = self._ctrl.name
            logging.info('%s | %s - Connection established!', self.id, self.device)
            self._connect_attempts = 0
            # From now on, follow the udev events generated for this device.
            self._udev.register_for_device_events(self._device, self._on_udev_notification)
        else:
            logging.debug(
                'Controller._on_connect_success() - %s | %s: Received event on dead object. data=%s',
                self.id,
                self.device,
                data,
            )

    def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):  # pylint: disable=unused-argument
        '''@brief Function called when we fail to connect to the Controller.

        @param op_obj: The connect operation that just failed.
        @param err: Error object; its 'domain' and 'message' get logged.
        @param fail_cnt: Unused.
        '''
        op_obj.kill()
        self._connect_op = None
        if self._alive():
            if self._connect_attempts == 1:
                # Do a fast re-try on the first failure.
                self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC)
            elif self._connect_attempts == 2:
                # If the fast connect re-try fails, then we can print a message to
                # indicate the failure, and start a slow re-try period.
                self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC)
                logging.error('%s Failed to connect to controller. %s %s', self.id, err.domain, err.message)

            if self._should_try_to_reconnect():
                logging.debug(
                    'Controller._on_connect_fail() - %s %s. Retry in %s sec.',
                    self.id,
                    err,
                    self._retry_connect_tmr.get_timeout(),
                )
                self._retry_connect_tmr.start()
        else:
            logging.debug(
                'Controller._on_connect_fail() - %s Received event on dead object. %s %s',
                self.id,
                err.domain,
                err.message,
            )

    def disconnect(self, disconnected_cb, keep_connection):
        '''@brief Issue an asynchronous disconnect command to a Controller.
        Once the async command has completed, the callback 'disconnected_cb'
        will be invoked. If a controller is already disconnected, then the
        callback will be added to the main loop's next idle slot to be executed
        ASAP.

        @param disconnected_cb: Callback to be called when disconnect has
                                completed. the callback must have this signature:
                                    def cback(controller: Controller, success: bool)
        @param keep_connection: Whether the underlying connection should remain
                                in the kernel.
        '''
        logging.debug(
            'Controller.disconnect() - %s | %s: keep_connection=%s', self.id, self.device, keep_connection
        )
        if self._ctrl and self._ctrl.connected() and not keep_connection:
            logging.info('%s | %s - Disconnect initiated', self.id, self.device)
            op = gutil.AsyncTask(self._on_disconn_success, self._on_disconn_fail, self._ctrl.disconnect)
            op.run_async(disconnected_cb)
        else:
            # Nothing to disconnect (or the connection must be kept).
            self._defer_disconnected_cb(disconnected_cb, True)

    def _defer_disconnected_cb(self, disconnected_cb, success: bool):
        '''Schedule disconnected_cb(self, success) on the main loop's next
        idle period.

        The callback cannot be called directly as the current Controller
        object is in the process of being disconnected and the callback will
        in fact delete the object. This would invariably lead to
        unpredictable outcome.
        '''
        GLib.idle_add(disconnected_cb, self, success)

    def _on_disconn_success(self, op_obj: gutil.AsyncTask, data, disconnected_cb):  # pylint: disable=unused-argument
        '''Called when the asynchronous disconnect completed. Reports
        success=True to @disconnected_cb.'''
        logging.debug('Controller._on_disconn_success() - %s | %s', self.id, self.device)
        op_obj.kill()
        self._defer_disconnected_cb(disconnected_cb, True)

    def _on_disconn_fail(
        self, op_obj: gutil.AsyncTask, err, fail_cnt, disconnected_cb
    ):  # pylint: disable=unused-argument
        '''Called when the asynchronous disconnect failed. Reports
        success=False to @disconnected_cb.'''
        logging.debug('Controller._on_disconn_fail() - %s | %s: %s', self.id, self.device, err)
        op_obj.kill()
        self._defer_disconnected_cb(disconnected_cb, False)
# ******************************************************************************
class Dc(Controller):
    '''@brief This object establishes a connection to one Discovery Controller (DC).
    It retrieves the discovery log pages and caches them.
    It also monitors udev events associated with that DC and updates
    the cached discovery log pages accordingly.
    '''

    # Delay (in seconds) before a failed operation is retried.
    GET_LOG_PAGE_RETRY_PERIOD_SEC = 20
    REGISTRATION_RETRY_PERIOD_SEC = 5
    GET_SUPPORTED_RETRY_PERIOD_SEC = 5

    # Historical (misspelled) names kept as aliases for backward compatibility.
    GET_LOG_PAGE_RETRY_RERIOD_SEC = GET_LOG_PAGE_RETRY_PERIOD_SEC
    REGISTRATION_RETRY_RERIOD_SEC = REGISTRATION_RETRY_PERIOD_SEC
    GET_SUPPORTED_RETRY_RERIOD_SEC = GET_SUPPORTED_RETRY_PERIOD_SEC

    def __init__(self, staf, tid: trid.TID, log_pages=None, origin=None):
        '''@brief Create a Discovery Controller object.

        @param staf: Service object, forwarded to the parent class.
        @param tid: Transport ID of the discovery controller.
        @param log_pages: Optional initial content of the log pages cache.
        @param origin: Optional initial origin of this controller (see the
                       'origin' property).
        '''
        super().__init__(tid, staf, discovery_ctrl=True)
        self._register_op = None
        self._get_supported_op = None
        self._get_log_op = None
        self._origin = origin
        self._log_pages = log_pages if log_pages else []  # Log pages cache

        # For Avahi-discovered DCs that later become unresponsive, monitor how
        # long the controller remains unresponsive and if it does not return for
        # a configurable soak period (_ctrl_unresponsive_tmr), remove that
        # controller. Only Avahi-discovered controllers need this timeout-based
        # cleanup.
        self._ctrl_unresponsive_time = None  # The time at which connectivity was lost
        self._ctrl_unresponsive_tmr = gutil.GTimer(0, self._serv.controller_unresponsive, self.tid)

    def _release_resources(self):
        '''Release the parent's resources, then kill the "unresponsive"
        timer and empty the log pages cache.'''
        logging.debug('Dc._release_resources() - %s | %s', self.id, self.device)
        super()._release_resources()

        if self._ctrl_unresponsive_tmr is not None:
            self._ctrl_unresponsive_tmr.kill()

        self._log_pages = []
        self._ctrl_unresponsive_tmr = None

    def _kill_ops(self):
        '''Kill and forget every pending operation (connect, get log page,
        registration, get supported log pages).'''
        super()._kill_ops()
        if self._get_log_op:
            self._get_log_op.kill()
            self._get_log_op = None
        if self._register_op:
            self._register_op.kill()
            self._register_op = None
        if self._get_supported_op:
            self._get_supported_op.kill()
            self._get_supported_op = None

    def all_ops_completed(self) -> bool:
        '''@brief Returns True if all operations have completed. False otherwise.'''
        return (
            super().all_ops_completed()
            and (self._get_log_op is None or self._get_log_op.completed())
            and (self._register_op is None or self._register_op.completed())
            and (self._get_supported_op is None or self._get_supported_op.completed())
        )

    @property
    def origin(self):
        '''@brief Return how this controller came into existence. Was it
        "discovered" through mDNS service discovery (TP8009), was it manually
        "configured" in stafd.conf, or was it a "referral".
        '''
        return self._origin

    @origin.setter
    def origin(self, value):
        '''@brief Set the origin of this controller.

        Only 'discovered', 'configured' and 'referral' are accepted. Any
        other value is rejected with an error log and the origin is left
        unchanged. After a successful change the "lost controller" handling
        is re-evaluated since it depends on the origin.
        '''
        if value in ('discovered', 'configured', 'referral'):
            self._origin = value
            self._handle_lost_controller()
        else:
            logging.error('%s | %s - Trying to set invalid origin to %s', self.id, self.device, value)

    def reload_hdlr(self):
        '''@brief This is called when a "reload" signal is received.'''
        logging.debug('Dc.reload_hdlr() - %s | %s', self.id, self.device)

        self._handle_lost_controller()
        self._resync_with_controller()

    def info(self) -> dict:
        '''@brief Get the controller info for this object'''
        timeout = conf.SvcConf().zeroconf_persistence_sec
        unresponsive_time = (
            time.asctime(self._ctrl_unresponsive_time) if self._ctrl_unresponsive_time is not None else '---'
        )
        info = super().info()
        info['origin'] = self.origin
        if self.origin == 'discovered':
            # The code that handles "unresponsive" DCs only applies to
            # discovered DCs. So, let's only print that info when it's relevant.
            info['unresponsive timer'] = str(self._ctrl_unresponsive_tmr)
            info['unresponsive timeout'] = f'{timeout} sec' if timeout >= 0 else 'forever'
            info['unresponsive time'] = unresponsive_time
        if self._get_log_op:
            info['get log page operation'] = str(self._get_log_op.as_dict())
        if self._register_op:
            info['register operation'] = str(self._register_op.as_dict())
        if self._get_supported_op:
            info['get supported log page operation'] = str(self._get_supported_op.as_dict())
        return info

    def cancel(self):
        '''@brief Used to cancel pending operations.'''
        super().cancel()
        if self._get_log_op:
            self._get_log_op.cancel()
        if self._register_op:
            self._register_op.cancel()
        if self._get_supported_op:
            self._get_supported_op.cancel()

    def log_pages(self) -> list:
        '''@brief Get the cached log pages for this object'''
        return self._log_pages

    def referrals(self) -> list:
        '''@brief Return the list of referrals

        These are the cached log page entries whose 'subtype' is
        'referral'. Entries without a 'subtype' key are skipped.
        '''
        return [page for page in self._log_pages if page.get('subtype') == 'referral']

    def _is_ddc(self) -> bool:
        '''Return True when a nvme.ctrl object exists and its dctype is
        anything other than 'cdc'.'''
        return bool(self._ctrl and self._ctrl.dctype != 'cdc')

    def _on_aen(self, aen: int):
        '''Refresh the log pages when the AEN indicates that the discovery
        log page changed (and a get log page operation exists).'''
        if aen == DLP_CHANGED and self._get_log_op:
            self._get_log_op.run_async()

    def _handle_lost_controller(self):
        '''Start, or reset, the monitoring of an unresponsive controller.

        For a "discovered" DC that is neither reported by Avahi nor
        connected, and when the configured persistence is >= 0, record the
        time at which connectivity was lost (first time only) and (re)start
        the "unresponsive" timer. In every other case the recorded time is
        cleared and the timer is stopped.
        '''
        if self.origin == 'discovered':  # Only apply to mDNS-discovered DCs
            if not self._serv.is_avahi_reported(self.tid) and not self.connected():
                timeout = conf.SvcConf().zeroconf_persistence_sec
                if timeout >= 0:
                    if self._ctrl_unresponsive_time is None:
                        self._ctrl_unresponsive_time = time.localtime()
                    self._ctrl_unresponsive_tmr.start(timeout)
                    logging.info(
                        '%s | %s - Controller is not responding. Will be removed by %s unless restored',
                        self.id,
                        self.device,
                        time.ctime(time.mktime(self._ctrl_unresponsive_time) + timeout),
                    )
                    return

                # Negative timeout: never give up on this controller.
                logging.info(
                    '%s | %s - Controller not responding. Retrying...',
                    self.id,
                    self.device,
                )

        self._ctrl_unresponsive_time = None
        self._ctrl_unresponsive_tmr.stop()
        self._ctrl_unresponsive_tmr.set_timeout(0)

    def is_unresponsive(self):
        '''@brief For "discovered" DC, return True if DC is unresponsive,
        False otherwise.
        '''
        return (
            self.origin == 'discovered'
            and not self._serv.is_avahi_reported(self.tid)
            and not self.connected()
            and self._ctrl_unresponsive_time is not None
            and self._ctrl_unresponsive_tmr.time_remaining() <= 0
        )

    def _resync_with_controller(self):
        '''Communicate with DC to resync the states'''
        # Re-run the first existing operation, in this order of preference:
        # registration, get supported log pages, get log page.
        if self._register_op:
            self._register_op.run_async()
        elif self._get_supported_op:
            self._get_supported_op.run_async()
        elif self._get_log_op:
            self._get_log_op.run_async()

    def _on_nvme_event(self, nvme_event: str):
        '''Resync with the DC upon a 'connected' or 'rediscover' nvme event.
        Other events are ignored.'''
        if nvme_event in ('connected', 'rediscover'):
            # This event indicates that the kernel
            # driver re-connected to the DC.
            logging.debug(
                'Dc._on_nvme_event() - %s | %s: Received "%s" event',
                self.id,
                self.device,
                nvme_event,
            )
            self._resync_with_controller()

    def _find_existing_connection(self):
        '''Look up an existing nvme discovery controller device matching
        this object's TID.'''
        return self._udev.find_nvme_dc_device(self.tid)

    def _post_registration_actions(self):
        '''Start the step that follows registration: either "Get supported
        log pages" (PLEO enabled, DC is not a CDC and libnvme supports it)
        or, otherwise, directly "Get discovery log page".'''
        # Need to check that supported_log_pages() is available (introduced in libnvme 1.2)
        has_supported_log_pages = hasattr(self._ctrl, 'supported_log_pages')
        if not has_supported_log_pages:
            logging.warning(
                '%s | %s - libnvme-%s does not support "Get supported log pages". Please upgrade libnvme.',
                self.id,
                self.device,
                defs.LIBNVME_VERSION,
            )

        # NOTE(review): an operation previously stored in self._get_supported_op
        # or self._get_log_op is replaced here without being killed - confirm
        # that AsyncTask cannot be left with a pending retry in that case.
        if conf.SvcConf().pleo_enabled and self._is_ddc() and has_supported_log_pages:
            self._get_supported_op = gutil.AsyncTask(
                self._on_get_supported_success, self._on_get_supported_fail, self._ctrl.supported_log_pages
            )
            self._get_supported_op.run_async()
        else:
            self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
            self._get_log_op.run_async()

    # --------------------------------------------------------------------------
    def _on_connect_success(self, op_obj: gutil.AsyncTask, data):
        '''@brief Function called when we successfully connect to the
        Discovery Controller.
        '''
        super()._on_connect_success(op_obj, data)

        if self._alive():
            # The controller is back: cancel the "unresponsive" monitoring.
            self._ctrl_unresponsive_time = None
            self._ctrl_unresponsive_tmr.stop()
            self._ctrl_unresponsive_tmr.set_timeout(0)

            if self._ctrl.is_registration_supported():
                self._register_op = gutil.AsyncTask(
                    self._on_registration_success,
                    self._on_registration_fail,
                    self._ctrl.registration_ctlr,
                    nvme.NVMF_DIM_TAS_REGISTER,
                )
                self._register_op.run_async()
            else:
                self._post_registration_actions()

    def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
        '''@brief Function called when we fail to connect to the Controller.'''
        super()._on_connect_fail(op_obj, err, fail_cnt)

        if self._alive():
            self._handle_lost_controller()

    # --------------------------------------------------------------------------
    def _on_registration_success(self, op_obj: gutil.AsyncTask, data):  # pylint: disable=unused-argument
        '''@brief Function called when we successfully register with the
        Discovery Controller. See self._register_op object
        for details.

        NOTE: The name _on_registration_success() may be misleading. "success"
        refers to the fact that a successful exchange was made with the DC.
        It doesn't mean that the registration itself succeeded.
        '''
        if self._alive():
            if data is not None:
                # A non-None result is reported as a registration error.
                logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data)
            else:
                logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device)

            self._post_registration_actions()
        else:
            logging.debug(
                'Dc._on_registration_success() - %s | %s: Received event on dead object.', self.id, self.device
            )

    def _on_registration_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
        '''@brief Function called when we fail to register with the
        Discovery Controller. See self._register_op object
        for details.
        '''
        if self._alive():
            logging.debug(
                'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec',
                self.id,
                self.device,
                err,
                Dc.REGISTRATION_RETRY_PERIOD_SEC,
            )
            if fail_cnt == 1:  # Throttle the logs. Only print the first time the command fails
                logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err)
            op_obj.retry(Dc.REGISTRATION_RETRY_PERIOD_SEC)
        else:
            logging.debug(
                'Dc._on_registration_fail() - %s | %s: Received event on dead object. %s',
                self.id,
                self.device,
                err,
            )
            op_obj.kill()

    # --------------------------------------------------------------------------
    def _on_get_supported_success(self, op_obj: gutil.AsyncTask, data):  # pylint: disable=unused-argument
        '''@brief Function called when we successfully retrieved the supported
        log pages from the Discovery Controller. See self._get_supported_op object
        for details.

        NOTE: The name _on_get_supported_success() may be misleading. "success"
        refers to the fact that a successful exchange was made with the DC.
        It doesn't mean that the Get Supported Log Page itself succeeded.
        '''
        if self._alive():
            try:
                dlp_supp_opts = data[nvme.NVME_LOG_LID_DISCOVER] >> 16
            except (TypeError, IndexError, KeyError):
                # No (usable) entry for the discovery log page: assume no option.
                dlp_supp_opts = 0

            logging.debug(
                'Dc._on_get_supported_success() - %s | %s: supported options = 0x%04X = %s',
                self.id,
                self.device,
                dlp_supp_opts,
                dlp_supp_opts_as_string(dlp_supp_opts),
            )

            # Only pass the 'lsp' argument when this libnvme's discover() accepts it.
            if 'lsp' in inspect.signature(self._ctrl.discover).parameters:
                lsp = nvme.NVMF_LOG_DISC_LSP_PLEO if dlp_supp_opts & nvme.NVMF_LOG_DISC_LID_PLEOS else 0
                self._get_log_op = gutil.AsyncTask(
                    self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover, lsp
                )
            else:
                logging.warning(
                    '%s | %s - libnvme-%s does not support setting PLEO bit. Please upgrade.',
                    self.id,
                    self.device,
                    defs.LIBNVME_VERSION,
                )
                self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
            self._get_log_op.run_async()
        else:
            logging.debug(
                'Dc._on_get_supported_success() - %s | %s: Received event on dead object.', self.id, self.device
            )

    def _on_get_supported_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
        '''@brief Function called when we fail to retrieve the supported log
        page from the Discovery Controller. See self._get_supported_op object
        for details.
        '''
        if self._alive():
            logging.debug(
                'Dc._on_get_supported_fail() - %s | %s: %s. Retry in %s sec',
                self.id,
                self.device,
                err,
                Dc.GET_SUPPORTED_RETRY_PERIOD_SEC,
            )
            if fail_cnt == 1:  # Throttle the logs. Only print the first time the command fails
                logging.error(
                    '%s | %s - Failed to Get supported log pages from Discovery Controller. %s',
                    self.id,
                    self.device,
                    err,
                )
            op_obj.retry(Dc.GET_SUPPORTED_RETRY_PERIOD_SEC)
        else:
            logging.debug(
                'Dc._on_get_supported_fail() - %s | %s: Received event on dead object. %s',
                self.id,
                self.device,
                err,
            )
            op_obj.kill()

    # --------------------------------------------------------------------------
    def _on_get_log_success(self, op_obj: gutil.AsyncTask, data):  # pylint: disable=unused-argument
        '''@brief Function called when we successfully retrieve the log pages
        from the Discovery Controller. See self._get_log_op object
        for details.

        The cache is replaced with a cleaned-up copy of @data (keys and
        values stripped, values converted to str). The service is then told
        that the log pages changed and, when applicable, that the referrals
        changed.
        '''
        if self._alive():
            # Note that for historical reasons too long to explain, the CDC may
            # return invalid addresses ('0.0.0.0', '::', or ''). Those need to
            # be filtered out.
            referrals_before = self.referrals()
            self._log_pages = (
                [
                    {k.strip(): str(v).strip() for k, v in dictionary.items()}
                    for dictionary in data
                    if dictionary.get('traddr', '').strip() not in ('0.0.0.0', '::', '')
                ]
                if data
                else []
            )
            logging.info(
                '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages)
            )
            referrals_after = self.referrals()
            self._serv.log_pages_changed(self, self.device)
            if referrals_after != referrals_before:
                logging.debug(
                    'Dc._on_get_log_success() - %s | %s: Referrals before = %s',
                    self.id,
                    self.device,
                    referrals_before,
                )
                logging.debug(
                    'Dc._on_get_log_success() - %s | %s: Referrals after  = %s',
                    self.id,
                    self.device,
                    referrals_after,
                )
                self._serv.referrals_changed()
        else:
            logging.debug(
                'Dc._on_get_log_success() - %s | %s: Received event on dead object.', self.id, self.device
            )

    def _on_get_log_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
        '''@brief Function called when we fail to retrieve the log pages
        from the Discovery Controller. See self._get_log_op object
        for details.
        '''
        if self._alive():
            logging.debug(
                'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec',
                self.id,
                self.device,
                err,
                Dc.GET_LOG_PAGE_RETRY_PERIOD_SEC,
            )
            if fail_cnt == 1:  # Throttle the logs. Only print the first time the command fails
                logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err)
            op_obj.retry(Dc.GET_LOG_PAGE_RETRY_PERIOD_SEC)
        else:
            logging.debug(
                'Dc._on_get_log_fail() - %s | %s: Received event on dead object. %s',
                self.id,
                self.device,
                err,
            )
            op_obj.kill()
# ******************************************************************************
class Ioc(Controller):
    '''@brief This object establishes a connection to one I/O Controller.'''

    def __init__(self, stac, tid: trid.TID):
        '''@brief Create an I/O Controller object.

        @param stac: Service object, forwarded to the parent class.
        @param tid: Transport ID of the I/O controller.
        '''
        # Set before the parent's constructor runs.
        self._dlpe = None
        super().__init__(tid, stac)

    def _find_existing_connection(self):
        '''Look up an existing nvme I/O controller device matching this
        object's TID.'''
        return self._udev.find_nvme_ioc_device(self.tid)

    def _on_aen(self, aen: int):
        '''AENs are ignored for I/O controllers.'''

    def _on_nvme_event(self, nvme_event):
        '''nvme events are ignored for I/O controllers.'''

    def reload_hdlr(self):
        '''@brief This is called when a "reload" signal is received.

        A new connection attempt is scheduled only when the controller is
        not connected and the retry timer has no time remaining.
        '''
        if self.connected():
            return
        if self._retry_connect_tmr.time_remaining() != 0:
            return
        self._try_to_connect_deferred.schedule()

    @property
    def eflags(self):
        '''@brief Return the eflags field of the DLPE (0 if there is none)'''
        return get_eflags(self._dlpe)

    @property
    def ncc(self):
        '''@brief Return the "Not Connected to CDC" status'''
        return get_ncc(self.eflags)

    def details(self) -> dict:
        '''@brief return detailed debug info about this controller'''
        info = super().details()
        info['dlpe'] = str(self._dlpe)
        info['dlpe.eflags.ncc'] = str(self.ncc)
        return info

    def update_dlpe(self, dlpe):
        '''@brief This method is called when a new DLPE associated
        with this controller is received.

        When the NCC bit goes from set to cleared while the controller is
        not connected, the connect attempts counter is reset and a new
        connection attempt is scheduled.
        '''
        ncc_after = get_ncc(get_eflags(dlpe))
        ncc_before = self.ncc
        self._dlpe = dlpe

        ncc_cleared = ncc_before and not ncc_after
        if ncc_cleared and not self.connected():
            self._connect_attempts = 0
            self._try_to_connect_deferred.schedule()

    def _should_try_to_reconnect(self):
        '''@brief This is used to determine when it's time to stop trying to connect'''
        if not self.ncc:
            # Without the NCC bit there is no limit on the number of attempts.
            return True

        attempts_limit = conf.SvcConf().connect_attempts_on_ncc
        if attempts_limit == 0:  # 0 means "no limit"
            return True
        return self._connect_attempts < attempts_limit
|