summaryrefslogtreecommitdiffstats
path: root/ansible_collections/community/general/plugins/modules/aerospike_migrations.py
blob: 1eee5b1a2f18092d31773162a41c3bdc3a21a8dc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""short_description: Check or wait for migrations between nodes"""

# Copyright (c) 2018, Albert Autin
# GNU General Public License v3.0+ (see LICENSES/GPL-3.0-or-later.txt or https://www.gnu.org/licenses/gpl-3.0.txt)
# SPDX-License-Identifier: GPL-3.0-or-later
from __future__ import (absolute_import, division, print_function)

__metaclass__ = type

DOCUMENTATION = '''
---
module: aerospike_migrations
short_description: Check or wait for migrations between nodes
description:
    - This can be used to check for migrations in a cluster.
      This makes it easy to do a rolling upgrade/update on Aerospike nodes.
    - If waiting for migrations is not desired, simply just poll until
      port 3000 if available or asinfo -v status returns ok
author: "Albert Autin (@Alb0t)"
extends_documentation_fragment:
  - community.general.attributes
attributes:
  check_mode:
    support: full
  diff_mode:
    support: none
options:
    host:
        description:
            - Which host do we use as seed for info connection
        required: false
        type: str
        default: localhost
    port:
        description:
            - Which port to connect to Aerospike on (service port)
        required: false
        type: int
        default: 3000
    connect_timeout:
        description:
            - How long to try to connect before giving up (milliseconds)
        required: false
        type: int
        default: 1000
    consecutive_good_checks:
        description:
            - How many times should the cluster report "no migrations"
              consecutively before returning OK back to ansible?
        required: false
        type: int
        default: 3
    sleep_between_checks:
        description:
            - How long to sleep between each check (seconds).
        required: false
        type: int
        default: 60
    tries_limit:
        description:
            - How many times do we poll before giving up and failing?
        default: 300
        required: false
        type: int
    local_only:
        description:
            - Do you wish to only check for migrations on the local node
              before returning, or do you want all nodes in the cluster
              to finish before returning?
        required: true
        type: bool
    min_cluster_size:
        description:
            - Check will return bad until cluster size is met
              or until tries is exhausted
        required: false
        type: int
        default: 1
    fail_on_cluster_change:
        description:
            - Fail if the cluster key changes
              if something else is changing the cluster, we may want to fail
        required: false
        type: bool
        default: true
    migrate_tx_key:
        description:
            - The metric key used to determine if we have tx migrations
              remaining. Changeable due to backwards compatibility.
        required: false
        type: str
        default: migrate_tx_partitions_remaining
    migrate_rx_key:
        description:
            - The metric key used to determine if we have rx migrations
              remaining. Changeable due to backwards compatibility.
        required: false
        type: str
        default: migrate_rx_partitions_remaining
    target_cluster_size:
        description:
            - When all aerospike builds in the cluster are greater than
              version 4.3, then the C(cluster-stable) info command will be used.
              Inside this command, you can optionally specify what the target
              cluster size is - but it is not necessary. You can still rely on
              min_cluster_size if you don't want to use this option.
            - If this option is specified on a cluster that has at least 1
              host <4.3 then it will be ignored until the min version reaches
              4.3.
        required: false
        type: int
'''
EXAMPLES = '''
# check for migrations on local node
- name: Wait for migrations on local node before proceeding
  community.general.aerospike_migrations:
    host: "localhost"
    connect_timeout: 2000
    consecutive_good_checks: 5
    sleep_between_checks: 15
    tries_limit: 600
    local_only: false

# example playbook:
- name: Upgrade aerospike
  hosts: all
  become: true
  serial: 1
  tasks:
    - name: Install dependencies
      ansible.builtin.apt:
        name:
            - python
            - python-pip
            - python-setuptools
        state: latest
    - name: Setup aerospike
      ansible.builtin.pip:
          name: aerospike
# check for migrations every (sleep_between_checks)
# If at least (consecutive_good_checks) checks come back OK in a row, then return OK.
# Will exit if any exception, which can be caused by bad nodes,
# nodes not returning data, or other reasons.
# Maximum runtime before giving up in this case will be:
# Tries Limit * Sleep Between Checks * delay * retries
    - name: Wait for aerospike migrations
      community.general.aerospike_migrations:
          local_only: true
          sleep_between_checks: 1
          tries_limit: 5
          consecutive_good_checks: 3
          fail_on_cluster_change: true
          min_cluster_size: 3
          target_cluster_size: 4
      register: migrations_check
      until: migrations_check is succeeded
      changed_when: false
      delay: 60
      retries: 120
    - name: Another thing
      ansible.builtin.shell: |
          echo foo
    - name: Reboot
      ansible.builtin.reboot:
'''

RETURN = '''
# Returns only a success/failure result. Changed is always false.
'''

import traceback

from ansible.module_utils.basic import AnsibleModule, missing_required_lib

LIB_FOUND_ERR = None
try:
    import aerospike
    from time import sleep
    import re
except ImportError as ie:
    LIB_FOUND = False
    LIB_FOUND_ERR = traceback.format_exc()
else:
    LIB_FOUND = True


def run_module():
    """run ansible module"""
    module_args = dict(
        host=dict(type='str', required=False, default='localhost'),
        port=dict(type='int', required=False, default=3000),
        connect_timeout=dict(type='int', required=False, default=1000),
        consecutive_good_checks=dict(type='int', required=False, default=3),
        sleep_between_checks=dict(type='int', required=False, default=60),
        tries_limit=dict(type='int', required=False, default=300),
        local_only=dict(type='bool', required=True),
        min_cluster_size=dict(type='int', required=False, default=1),
        target_cluster_size=dict(type='int', required=False, default=None),
        fail_on_cluster_change=dict(type='bool', required=False, default=True),
        migrate_tx_key=dict(type='str', required=False, no_log=False,
                            default="migrate_tx_partitions_remaining"),
        migrate_rx_key=dict(type='str', required=False, no_log=False,
                            default="migrate_rx_partitions_remaining")
    )

    result = dict(
        changed=False,
    )

    module = AnsibleModule(
        argument_spec=module_args,
        supports_check_mode=True
    )
    if not LIB_FOUND:
        module.fail_json(msg=missing_required_lib('aerospike'),
                         exception=LIB_FOUND_ERR)

    try:
        if module.check_mode:
            has_migrations, skip_reason = False, None
        else:
            migrations = Migrations(module)
            has_migrations, skip_reason = migrations.has_migs(
                module.params['local_only']
            )

        if has_migrations:
            module.fail_json(msg="Failed.", skip_reason=skip_reason)
    except Exception as e:
        module.fail_json(msg="Error: {0}".format(e))

    module.exit_json(**result)


class Migrations:
    """ Check or wait for migrations between nodes """

    def __init__(self, module):
        self.module = module
        self._client = self._create_client().connect()
        self._nodes = {}
        self._update_nodes_list()
        self._cluster_statistics = {}
        self._update_cluster_statistics()
        self._namespaces = set()
        self._update_cluster_namespace_list()
        self._build_list = set()
        self._update_build_list()
        self._start_cluster_key = \
            self._cluster_statistics[self._nodes[0]]['cluster_key']

    def _create_client(self):
        """ TODO: add support for auth, tls, and other special features
         I won't use those features, so I'll wait until somebody complains
         or does it for me (Cross fingers)
         create the client object"""
        config = {
            'hosts': [
                (self.module.params['host'], self.module.params['port'])
            ],
            'policies': {
                'timeout': self.module.params['connect_timeout']
            }
        }
        return aerospike.client(config)

    def _info_cmd_helper(self, cmd, node=None, delimiter=';'):
        """delimiter is for separate stats that come back, NOT for kv
        separation which is ="""
        if node is None:  # If no node passed, use the first one (local)
            node = self._nodes[0]
        data = self._client.info_node(cmd, node)
        data = data.split("\t")
        if len(data) != 1 and len(data) != 2:
            self.module.fail_json(
                msg="Unexpected number of values returned in info command: " +
                str(len(data))
            )
        # data will be in format 'command\touput'
        data = data[-1]
        data = data.rstrip("\n\r")
        data_arr = data.split(delimiter)

        # some commands don't return in kv format
        # so we dont want a dict from those.
        if '=' in data:
            retval = dict(
                metric.split("=", 1) for metric in data_arr
            )
        else:
            # if only 1 element found, and not kv, return just the value.
            if len(data_arr) == 1:
                retval = data_arr[0]
            else:
                retval = data_arr
        return retval

    def _update_build_list(self):
        """creates self._build_list which is a unique list
        of build versions."""
        self._build_list = set()
        for node in self._nodes:
            build = self._info_cmd_helper('build', node)
            self._build_list.add(build)

    # just checks to see if the version is 4.3 or greater
    def _can_use_cluster_stable(self):
        # if version <4.3 we can't use cluster-stable info cmd
        # regex hack to check for versions beginning with 0-3 or
        # beginning with 4.0,4.1,4.2
        if re.search(R'^([0-3]\.|4\.[0-2])', min(self._build_list)):
            return False
        return True

    def _update_cluster_namespace_list(self):
        """ make a unique list of namespaces
        TODO: does this work on a rolling namespace add/deletion?
        thankfully if it doesn't, we dont need this on builds >=4.3"""
        self._namespaces = set()
        for node in self._nodes:
            namespaces = self._info_cmd_helper('namespaces', node)
            for namespace in namespaces:
                self._namespaces.add(namespace)

    def _update_cluster_statistics(self):
        """create a dict of nodes with their related stats """
        self._cluster_statistics = {}
        for node in self._nodes:
            self._cluster_statistics[node] = \
                self._info_cmd_helper('statistics', node)

    def _update_nodes_list(self):
        """get a fresh list of all the nodes"""
        self._nodes = self._client.get_nodes()
        if not self._nodes:
            self.module.fail_json("Failed to retrieve at least 1 node.")

    def _namespace_has_migs(self, namespace, node=None):
        """returns a True or False.
        Does the namespace have migrations for the node passed?
        If no node passed, uses the local node or the first one in the list"""
        namespace_stats = self._info_cmd_helper("namespace/" + namespace, node)
        try:
            namespace_tx = \
                int(namespace_stats[self.module.params['migrate_tx_key']])
            namespace_rx = \
                int(namespace_stats[self.module.params['migrate_rx_key']])
        except KeyError:
            self.module.fail_json(
                msg="Did not find partition remaining key:" +
                self.module.params['migrate_tx_key'] +
                " or key:" +
                self.module.params['migrate_rx_key'] +
                " in 'namespace/" +
                namespace +
                "' output."
            )
        except TypeError:
            self.module.fail_json(
                msg="namespace stat returned was not numerical"
            )
        return namespace_tx != 0 or namespace_rx != 0

    def _node_has_migs(self, node=None):
        """just calls namespace_has_migs and
        if any namespace has migs returns true"""
        migs = 0
        self._update_cluster_namespace_list()
        for namespace in self._namespaces:
            if self._namespace_has_migs(namespace, node):
                migs += 1
        return migs != 0

    def _cluster_key_consistent(self):
        """create a dictionary to store what each node
        returns the cluster key as. we should end up with only 1 dict key,
        with the key being the cluster key."""
        cluster_keys = {}
        for node in self._nodes:
            cluster_key = self._cluster_statistics[node][
                'cluster_key']
            if cluster_key not in cluster_keys:
                cluster_keys[cluster_key] = 1
            else:
                cluster_keys[cluster_key] += 1
        if len(cluster_keys.keys()) == 1 and \
                self._start_cluster_key in cluster_keys:
            return True
        return False

    def _cluster_migrates_allowed(self):
        """ensure all nodes have 'migrate_allowed' in their stats output"""
        for node in self._nodes:
            node_stats = self._info_cmd_helper('statistics', node)
            allowed = node_stats['migrate_allowed']
            if allowed == "false":
                return False
        return True

    def _cluster_has_migs(self):
        """calls node_has_migs for each node"""
        migs = 0
        for node in self._nodes:
            if self._node_has_migs(node):
                migs += 1
        if migs == 0:
            return False
        return True

    def _has_migs(self, local):
        if local:
            return self._local_node_has_migs()
        return self._cluster_has_migs()

    def _local_node_has_migs(self):
        return self._node_has_migs(None)

    def _is_min_cluster_size(self):
        """checks that all nodes in the cluster are returning the
        minimum cluster size specified in their statistics output"""
        sizes = set()
        for node in self._cluster_statistics:
            sizes.add(int(self._cluster_statistics[node]['cluster_size']))

        if (len(sizes)) > 1:  # if we are getting more than 1 size, lets say no
            return False
        if (min(sizes)) >= self.module.params['min_cluster_size']:
            return True
        return False

    def _cluster_stable(self):
        """Added 4.3:
        cluster-stable:size=<target-cluster-size>;ignore-migrations=<yes/no>;namespace=<namespace-name>
        Returns the current 'cluster_key' when the following are satisfied:

         If 'size' is specified then the target node's 'cluster-size'
         must match size.
         If 'ignore-migrations' is either unspecified or 'false' then
         the target node's migrations counts must be zero for the provided
         'namespace' or all namespaces if 'namespace' is not provided."""
        cluster_key = set()
        cluster_key.add(self._info_cmd_helper('statistics')['cluster_key'])
        cmd = "cluster-stable:"
        target_cluster_size = self.module.params['target_cluster_size']
        if target_cluster_size is not None:
            cmd = cmd + "size=" + str(target_cluster_size) + ";"
        for node in self._nodes:
            try:
                cluster_key.add(self._info_cmd_helper(cmd, node))
            except aerospike.exception.ServerError as e:  # unstable-cluster is returned in form of Exception
                if 'unstable-cluster' in e.msg:
                    return False
                raise e
        if len(cluster_key) == 1:
            return True
        return False

    def _cluster_good_state(self):
        """checks a few things to make sure we're OK to say the cluster
        has no migs. It could be in a unhealthy condition that does not allow
        migs, or a split brain"""
        if self._cluster_key_consistent() is not True:
            return False, "Cluster key inconsistent."
        if self._is_min_cluster_size() is not True:
            return False, "Cluster min size not reached."
        if self._cluster_migrates_allowed() is not True:
            return False, "migrate_allowed is false somewhere."
        return True, "OK."

    def has_migs(self, local=True):
        """returns a boolean, False if no migrations otherwise True"""
        consecutive_good = 0
        try_num = 0
        skip_reason = list()
        while \
            try_num < int(self.module.params['tries_limit']) and \
                consecutive_good < \
                int(self.module.params['consecutive_good_checks']):

            self._update_nodes_list()
            self._update_cluster_statistics()

            # These checks are outside of the while loop because
            # we probably want to skip & sleep instead of failing entirely
            stable, reason = self._cluster_good_state()
            if stable is not True:
                skip_reason.append(
                    "Skipping on try#" + str(try_num) +
                    " for reason:" + reason
                )
            else:
                if self._can_use_cluster_stable():
                    if self._cluster_stable():
                        consecutive_good += 1
                    else:
                        consecutive_good = 0
                        skip_reason.append(
                            "Skipping on try#" + str(try_num) +
                            " for reason:" + " cluster_stable"
                        )
                elif self._has_migs(local):
                    # print("_has_migs")
                    skip_reason.append(
                        "Skipping on try#" + str(try_num) +
                        " for reason:" + " migrations"
                    )
                    consecutive_good = 0
                else:
                    consecutive_good += 1
                    if consecutive_good == self.module.params[
                            'consecutive_good_checks']:
                        break
            try_num += 1
            sleep(self.module.params['sleep_between_checks'])
            # print(skip_reason)
        if consecutive_good == self.module.params['consecutive_good_checks']:
            return False, None
        return True, skip_reason


def main():
    """main method for ansible module"""
    run_module()


if __name__ == '__main__':
    main()