src/pybind/mgr/diskprediction_local/predictor.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

"""Sample code for disk failure prediction.

This sample code is a community version for anyone who is interested in Machine
Learning and care about disk failure.

This class provides a disk failure prediction module. Given models dirpath to
initialize a predictor instance and then use 6 days data to predict. Predict
function will return a string to indicate disk failure status: "Good",
"Warning", "Bad", or "Unknown".

An example code is as follows:

>>> model = disk_failure_predictor.DiskFailurePredictor()
>>> status = model.initialize("./models")
>>> if status:
>>>     model.predict(disk_days)
'Bad'


Provided by ProphetStor Data Services Inc.
http://www.prophetstor.com/

"""

from __future__ import print_function
import os
import json
import pickle


def get_diskfailurepredictor_path():
    path = os.path.abspath(__file__)
    dir_path = os.path.dirname(path)
    return dir_path


class DiskFailurePredictor(object):
    """Disk failure prediction

    This class implements a disk failure prediction module.
    """

    CONFIG_FILE = "config.json"
    EXCLUDED_ATTRS = ['smart_9_raw', 'smart_241_raw', 'smart_242_raw']

    def __init__(self):
        """
        This function may throw exception due to wrong file operation.
        """

        self.model_dirpath = ""
        self.model_context = {}

    def initialize(self, model_dirpath):
        """
        Initialize all models.

        Args: None

        Returns:
            Error message. If all goes well, return an empty string.

        Raises:
        """

        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            return "Missing config file: " + config_path
        else:
            with open(config_path) as f_conf:
                self.model_context = json.load(f_conf)

        for model_name in self.model_context:
            model_path = os.path.join(model_dirpath, model_name)

            if not os.path.isfile(model_path):
                return "Missing model file: " + model_path

        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days):
        """
        Preprocess disk attributes.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            new_disk_days: Processed disk days.
        """

        req_attrs = []
        new_disk_days = []

        attr_list = set.intersection(*[set(disk_day.keys())
                                       for disk_day in disk_days])
        for attr in attr_list:
            if (attr.startswith('smart_') and attr.endswith('_raw')) and \
                    attr not in self.EXCLUDED_ATTRS:
                req_attrs.append(attr)

        for disk_day in disk_days:
            new_disk_day = {}
            for attr in req_attrs:
                if float(disk_day[attr]) >= 0.0:
                    new_disk_day[attr] = disk_day[attr]

            new_disk_days.append(new_disk_day)

        return new_disk_days

    @staticmethod
    def __get_diff_attrs(disk_days):
        """
        Get 5 days differential attributes.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            attr_list: All S.M.A.R.T. attributes used in given disk. Here we
                       use intersection set of all disk days.

            diff_disk_days: A list struct comprises 5 dictionaries, each
                            dictionary contains differential attributes.

        Raises:
            Exceptions of wrong list/dict operations.
        """

        all_attrs = [set(disk_day.keys()) for disk_day in disk_days]
        attr_list = list(set.intersection(*all_attrs))
        attr_list = disk_days[0].keys()
        prev_days = disk_days[:-1]
        curr_days = disk_days[1:]
        diff_disk_days = []

        for prev, cur in zip(prev_days, curr_days):
            diff_disk_days.append({attr:(int(cur[attr]) - int(prev[attr]))
                                   for attr in attr_list})

        return attr_list, diff_disk_days

    def __get_best_models(self, attr_list):
        """
        Find the best model from model list according to given attribute list.

        Args:
            attr_list: All S.M.A.R.T. attributes used in given disk.

        Returns:
            modelpath: The best model for the given attribute list.
            model_attrlist: 'Ordered' attribute list of the returned model.
                            Must be aware that SMART attributes is in order.

        Raises:
        """

        models = self.model_context.keys()

        scores = []
        for model_name in models:
            scores.append(sum(attr in attr_list
                              for attr in self.model_context[model_name]))
        max_score = max(scores)

        # Skip if too few matched attributes.
        if max_score < 3:
            print("Too few matched attributes")
            return None

        best_models = {}
        best_model_indices = [idx for idx, score in enumerate(scores)
                              if score > max_score - 2]
        for model_idx in best_model_indices:
            model_name = list(models)[model_idx]
            model_path = os.path.join(self.model_dirpath, model_name)
            model_attrlist = self.model_context[model_name]
            best_models[model_path] = model_attrlist

        return best_models
        # return os.path.join(self.model_dirpath, model_name), model_attrlist

    @staticmethod
    def __get_ordered_attrs(disk_days, model_attrlist):
        """
        Return ordered attributes of given disk days.

        Args:
            disk_days: Unordered disk days.
            model_attrlist: Model's ordered attribute list.

        Returns:
            ordered_attrs: Ordered disk days.

        Raises: None
        """

        ordered_attrs = []

        for one_day in disk_days:
            one_day_attrs = []

            for attr in model_attrlist:
                if attr in one_day:
                    one_day_attrs.append(one_day[attr])
                else:
                    one_day_attrs.append(0)

            ordered_attrs.append(one_day_attrs)

        return ordered_attrs

    def predict(self, disk_days):
        """
        Predict using given 6-days disk S.M.A.R.T. attributes.

        Args:
            disk_days: A list struct comprises 6 dictionaries. These
                       dictionaries store 'consecutive' days of disk SMART
                       attributes.
        Returns:
            A string indicates prediction result. One of following four strings
            will be returned according to disk failure status:
            (1) Good : Disk is health
            (2) Warning : Disk has some symptoms but may not fail immediately
            (3) Bad : Disk is in danger and data backup is highly recommended
            (4) Unknown : Not enough data for prediction.

        Raises:
            Pickle exceptions
        """

        all_pred = []

        proc_disk_days = self.__preprocess(disk_days)
        attr_list, diff_data = DiskFailurePredictor.__get_diff_attrs(proc_disk_days)
        modellist = self.__get_best_models(attr_list)
        if modellist is None:
            return "Unknown"

        for modelpath in modellist:
            model_attrlist = modellist[modelpath]
            ordered_data = DiskFailurePredictor.__get_ordered_attrs(
                diff_data, model_attrlist)

            try:
                with open(modelpath, 'rb') as f_model:
                    clf = pickle.load(f_model)

            except UnicodeDecodeError:
                # Compatibility for python3
                with open(modelpath, 'rb') as f_model:
                    clf = pickle.load(f_model, encoding='latin1')

            pred = clf.predict(ordered_data)

            all_pred.append(1 if any(pred) else 0)

        score = 2 ** sum(all_pred) - len(modellist)
        if score > 10:
            return "Bad"
        if score > 4:
            return "Warning"
        return "Good"