summaryrefslogtreecommitdiffstats
path: root/src/pybind/mgr/pg_autoscaler/module.py
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/pybind/mgr/pg_autoscaler/module.py507
1 files changed, 507 insertions, 0 deletions
diff --git a/src/pybind/mgr/pg_autoscaler/module.py b/src/pybind/mgr/pg_autoscaler/module.py
new file mode 100644
index 00000000..3d5d5175
--- /dev/null
+++ b/src/pybind/mgr/pg_autoscaler/module.py
@@ -0,0 +1,507 @@
+"""
+Automatically scale pg_num based on how much data is stored in each pool.
+"""
+
+import errno
+import json
+import mgr_util
+import threading
+import uuid
+from six import itervalues, iteritems
+from collections import defaultdict
+from prettytable import PrettyTable, PLAIN_COLUMNS
+
+from mgr_module import MgrModule
+
+"""
+Some terminology is made up for the purposes of this module:
+
+ - "raw pgs": pg count after applying replication, i.e. the real resource
+ consumption of a pool.
+ - "grow/shrink" - increase/decrease the pg_num in a pool
+ - "crush subtree" - non-overlapping domains in crush hierarchy: used as
+ units of resource management.
+"""
+
+INTERVAL = 5
+
+PG_NUM_MIN = 32 # unless specified on a per-pool basis
+
+def nearest_power_of_two(n):
+ v = int(n)
+
+ v -= 1
+ v |= v >> 1
+ v |= v >> 2
+ v |= v >> 4
+ v |= v >> 8
+ v |= v >> 16
+
+ # High bound power of two
+ v += 1
+
+ # Low bound power of tow
+ x = v >> 1
+
+ return x if (v - n) > (n - x) else v
+
+def effective_target_ratio(target_ratio, total_target_ratio, total_target_bytes, capacity):
+ """
+ Returns the target ratio after normalizing for ratios across pools and
+ adjusting for capacity reserved by pools that have target_size_bytes set.
+ """
+ target_ratio = float(target_ratio)
+ if total_target_ratio:
+ target_ratio = target_ratio / total_target_ratio
+
+ if total_target_bytes and capacity:
+ fraction_available = 1.0 - min(1.0, float(total_target_bytes) / capacity)
+ target_ratio *= fraction_available
+
+ return target_ratio
+
+
+class PgAdjustmentProgress(object):
+ """
+ Keeps the initial and target pg_num values
+ """
+ def __init__(self, pg_num, pg_num_target, ev_id, increase_decrease):
+ self._ev_id = ev_id
+ self._pg_num = pg_num
+ self._pg_num_target = pg_num_target
+ self._increase_decrease = increase_decrease
+
+
+class PgAutoscaler(MgrModule):
+ """
+ PG autoscaler.
+ """
+ COMMANDS = [
+ {
+ "cmd": "osd pool autoscale-status",
+ "desc": "report on pool pg_num sizing recommendation and intent",
+ "perm": "r"
+ },
+ ]
+
+ NATIVE_OPTIONS = [
+ 'mon_target_pg_per_osd',
+ 'mon_max_pg_per_osd',
+ ]
+
+ MODULE_OPTIONS = [
+ {
+ 'name': 'sleep_interval',
+ 'default': str(60),
+ },
+ ]
+
+ def __init__(self, *args, **kwargs):
+ super(PgAutoscaler, self).__init__(*args, **kwargs)
+ self._shutdown = threading.Event()
+
+ # So much of what we do peeks at the osdmap that it's easiest
+ # to just keep a copy of the pythonized version.
+ self._osd_map = None
+
+ def config_notify(self):
+ for opt in self.NATIVE_OPTIONS:
+ setattr(self,
+ opt,
+ self.get_ceph_option(opt))
+ self.log.debug(' native option %s = %s', opt, getattr(self, opt))
+ for opt in self.MODULE_OPTIONS:
+ setattr(self,
+ opt['name'],
+ self.get_module_option(opt['name']))
+ self.log.debug(' mgr option %s = %s',
+ opt['name'], getattr(self, opt['name']))
+
+
+ def handle_command(self, inbuf, cmd):
+ if cmd['prefix'] == "osd pool autoscale-status":
+ retval = self._command_autoscale_status(cmd)
+ else:
+ assert False # ceph-mgr should never pass us unknown cmds
+ return retval
+
+ def _command_autoscale_status(self, cmd):
+ osdmap = self.get_osdmap()
+ pools = osdmap.get_pools_by_name()
+ ps, root_map, pool_root = self._get_pool_status(osdmap, pools)
+
+ if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
+ return 0, json.dumps(ps, indent=2), ''
+ else:
+ table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
+ 'RATE', 'RAW CAPACITY',
+ 'RATIO', 'TARGET RATIO',
+ 'EFFECTIVE RATIO',
+ 'BIAS',
+ 'PG_NUM',
+# 'IDEAL',
+ 'NEW PG_NUM', 'AUTOSCALE'],
+ border=False)
+ table.left_padding_width = 0
+ table.right_padding_width = 1
+ table.align['POOL'] = 'l'
+ table.align['SIZE'] = 'r'
+ table.align['TARGET SIZE'] = 'r'
+ table.align['RATE'] = 'r'
+ table.align['RAW CAPACITY'] = 'r'
+ table.align['RATIO'] = 'r'
+ table.align['TARGET RATIO'] = 'r'
+ table.align['EFFECTIVE RATIO'] = 'r'
+ table.align['BIAS'] = 'r'
+ table.align['PG_NUM'] = 'r'
+# table.align['IDEAL'] = 'r'
+ table.align['NEW PG_NUM'] = 'r'
+ table.align['AUTOSCALE'] = 'l'
+ for p in ps:
+ if p['would_adjust']:
+ final = str(p['pg_num_final'])
+ else:
+ final = ''
+ if p['target_bytes'] > 0:
+ ts = mgr_util.format_bytes(p['target_bytes'], 6)
+ else:
+ ts = ''
+ if p['target_ratio'] > 0.0:
+ tr = '%.4f' % p['target_ratio']
+ else:
+ tr = ''
+ if p['effective_target_ratio'] > 0.0:
+ etr = '%.4f' % p['effective_target_ratio']
+ else:
+ etr = ''
+ table.add_row([
+ p['pool_name'],
+ mgr_util.format_bytes(p['logical_used'], 6),
+ ts,
+ p['raw_used_rate'],
+ mgr_util.format_bytes(p['subtree_capacity'], 6),
+ '%.4f' % p['capacity_ratio'],
+ tr,
+ etr,
+ p['bias'],
+ p['pg_num_target'],
+# p['pg_num_ideal'],
+ final,
+ p['pg_autoscale_mode'],
+ ])
+ return 0, table.get_string(), ''
+
+ def serve(self):
+ self.config_notify()
+ while not self._shutdown.is_set():
+ self._maybe_adjust()
+ self._shutdown.wait(timeout=int(self.sleep_interval))
+
+ def shutdown(self):
+ self.log.info('Stopping pg_autoscaler')
+ self._shutdown.set()
+
+ def get_subtree_resource_status(self, osdmap, crush):
+ """
+ For each CRUSH subtree of interest (i.e. the roots under which
+ we have pools), calculate the current resource usages and targets,
+ such as how many PGs there are, vs. how many PGs we would
+ like there to be.
+ """
+ result = {}
+ pool_root = {}
+ roots = []
+
+ class CrushSubtreeResourceStatus(object):
+ def __init__(self):
+ self.root_ids = []
+ self.osds = set()
+ self.osd_count = None # Number of OSDs
+ self.pg_target = None # Ideal full-capacity PG count?
+ self.pg_current = 0 # How many PGs already?
+ self.capacity = None # Total capacity of OSDs in subtree
+ self.pool_ids = []
+ self.pool_names = []
+ self.total_target_ratio = 0.0
+ self.total_target_bytes = 0 # including replication / EC overhead
+
+ # identify subtrees (note that they may overlap!)
+ for pool_id, pool in osdmap.get_pools().items():
+ cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
+ root_id = int(crush.get_rule_root(cr_name))
+ pool_root[pool_id] = root_id
+ osds = set(crush.get_osds_under(root_id))
+
+ # do we intersect an existing root?
+ s = None
+ for prev in itervalues(result):
+ if osds & prev.osds:
+ s = prev
+ break
+ if not s:
+ s = CrushSubtreeResourceStatus()
+ roots.append(s)
+ result[root_id] = s
+ s.root_ids.append(root_id)
+ s.osds |= osds
+ s.pool_ids.append(int(pool_id))
+ s.pool_names.append(pool['pool_name'])
+ s.pg_current += pool['pg_num_target'] * pool['size']
+ target_ratio = pool['options'].get('target_size_ratio', 0.0)
+ if target_ratio:
+ s.total_target_ratio += target_ratio
+ else:
+ target_bytes = pool['options'].get('target_size_bytes', 0)
+ if target_bytes:
+ s.total_target_bytes += target_bytes * osdmap.pool_raw_used_rate(pool_id)
+
+ # finish subtrees
+ all_stats = self.get('osd_stats')
+ for s in roots:
+ s.osd_count = len(s.osds)
+ s.pg_target = s.osd_count * int(self.mon_target_pg_per_osd)
+
+ capacity = 0.0
+ for osd_stats in all_stats['osd_stats']:
+ if osd_stats['osd'] in s.osds:
+ # Intentionally do not apply the OSD's reweight to
+ # this, because we want to calculate PG counts based
+ # on the physical storage available, not how it is
+ # reweighted right now.
+ capacity += osd_stats['kb'] * 1024
+
+ s.capacity = capacity
+
+ self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
+ s.root_ids,
+ s.pool_ids,
+ s.osd_count,
+ s.pg_target)
+
+ return result, pool_root
+
+ def _get_pool_status(
+ self,
+ osdmap,
+ pools,
+ threshold=3.0,
+ ):
+ assert threshold >= 2.0
+
+ crush_map = osdmap.get_crush()
+
+ root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)
+
+ df = self.get('df')
+ pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])
+
+ ret = []
+
+ # iterate over all pools to determine how they should be sized
+ for pool_name, p in iteritems(pools):
+ pool_id = p['pool']
+ if pool_id not in pool_stats:
+ # race with pool deletion; skip
+ continue
+
+ # FIXME: we assume there is only one take per pool, but that
+ # may not be true.
+ cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
+ root_id = int(crush_map.get_rule_root(cr_name))
+ pool_root[pool_name] = root_id
+
+ capacity = root_map[root_id].capacity
+ if capacity == 0:
+ self.log.debug('skipping empty subtree %s', cr_name)
+ continue
+
+ raw_used_rate = osdmap.pool_raw_used_rate(pool_id)
+
+ pool_logical_used = pool_stats[pool_id]['stored']
+ bias = p['options'].get('pg_autoscale_bias', 1.0)
+ target_bytes = 0
+ # ratio takes precedence if both are set
+ if p['options'].get('target_size_ratio', 0.0) == 0.0:
+ target_bytes = p['options'].get('target_size_bytes', 0)
+
+ # What proportion of space are we using?
+ actual_raw_used = pool_logical_used * raw_used_rate
+ actual_capacity_ratio = float(actual_raw_used) / capacity
+
+ pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
+ capacity_ratio = float(pool_raw_used) / capacity
+
+ self.log.info("effective_target_ratio {0} {1} {2} {3}".format(
+ p['options'].get('target_size_ratio', 0.0),
+ root_map[root_id].total_target_ratio,
+ root_map[root_id].total_target_bytes,
+ capacity))
+ target_ratio = effective_target_ratio(p['options'].get('target_size_ratio', 0.0),
+ root_map[root_id].total_target_ratio,
+ root_map[root_id].total_target_bytes,
+ capacity)
+
+ final_ratio = max(capacity_ratio, target_ratio)
+
+ # So what proportion of pg allowance should we be using?
+ pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias
+
+ final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
+ nearest_power_of_two(pool_pg_target))
+
+ self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
+ "pg target {4} quantized to {5} (current {6})".format(
+ p['pool_name'],
+ root_id,
+ final_ratio,
+ bias,
+ pool_pg_target,
+ final_pg_target,
+ p['pg_num_target']
+ ))
+
+ adjust = False
+ if (final_pg_target > p['pg_num_target'] * threshold or \
+ final_pg_target <= p['pg_num_target'] / threshold) and \
+ final_ratio >= 0.0 and \
+ final_ratio <= 1.0:
+ adjust = True
+
+ ret.append({
+ 'pool_id': pool_id,
+ 'pool_name': p['pool_name'],
+ 'crush_root_id': root_id,
+ 'pg_autoscale_mode': p['pg_autoscale_mode'],
+ 'pg_num_target': p['pg_num_target'],
+ 'logical_used': pool_logical_used,
+ 'target_bytes': target_bytes,
+ 'raw_used_rate': raw_used_rate,
+ 'subtree_capacity': capacity,
+ 'actual_raw_used': actual_raw_used,
+ 'raw_used': pool_raw_used,
+ 'actual_capacity_ratio': actual_capacity_ratio,
+ 'capacity_ratio': capacity_ratio,
+ 'target_ratio': p['options'].get('target_size_ratio', 0.0),
+ 'effective_target_ratio': target_ratio,
+ 'pg_num_ideal': int(pool_pg_target),
+ 'pg_num_final': final_pg_target,
+ 'would_adjust': adjust,
+ 'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
+ });
+
+ return (ret, root_map, pool_root)
+
+
+ def _maybe_adjust(self):
+ self.log.info('_maybe_adjust')
+ osdmap = self.get_osdmap()
+ pools = osdmap.get_pools_by_name()
+ ps, root_map, pool_root = self._get_pool_status(osdmap, pools)
+
+ # Anyone in 'warn', set the health message for them and then
+ # drop them from consideration.
+ too_few = []
+ too_many = []
+ bytes_and_ratio = []
+ health_checks = {}
+
+ total_bytes = dict([(r, 0) for r in iter(root_map)])
+ total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
+ target_bytes_pools = dict([(r, []) for r in iter(root_map)])
+
+ for p in ps:
+ pool_opts = pools[p['pool_name']]['options']
+ if pool_opts.get('target_size_ratio', 0) > 0 and pool_opts.get('target_size_bytes', 0) > 0:
+ bytes_and_ratio.append('Pool %s has target_size_bytes and target_size_ratio set' % p['pool_name'])
+ total_bytes[p['crush_root_id']] += max(
+ p['actual_raw_used'],
+ p['target_bytes'] * p['raw_used_rate'])
+ if p['target_bytes'] > 0:
+ total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
+ target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
+ if not p['would_adjust']:
+ continue
+ if p['pg_autoscale_mode'] == 'warn':
+ msg = 'Pool %s has %d placement groups, should have %d' % (
+ p['pool_name'],
+ p['pg_num_target'],
+ p['pg_num_final'])
+ if p['pg_num_final'] > p['pg_num_target']:
+ too_few.append(msg)
+ else:
+ too_many.append(msg)
+
+ if p['pg_autoscale_mode'] == 'on':
+ # Note that setting pg_num actually sets pg_num_target (see
+ # OSDMonitor.cc)
+ r = self.mon_command({
+ 'prefix': 'osd pool set',
+ 'pool': p['pool_name'],
+ 'var': 'pg_num',
+ 'val': str(p['pg_num_final'])
+ })
+
+ if r[0] != 0:
+ # FIXME: this is a serious and unexpected thing,
+ # we should expose it as a cluster log error once
+ # the hook for doing that from ceph-mgr modules is
+ # in.
+ self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
+ .format(p['pool_name'],
+ p['pg_num_final'], r))
+
+ if too_few:
+ summary = "{0} pools have too few placement groups".format(
+ len(too_few))
+ health_checks['POOL_TOO_FEW_PGS'] = {
+ 'severity': 'warning',
+ 'summary': summary,
+ 'detail': too_few
+ }
+ if too_many:
+ summary = "{0} pools have too many placement groups".format(
+ len(too_many))
+ health_checks['POOL_TOO_MANY_PGS'] = {
+ 'severity': 'warning',
+ 'summary': summary,
+ 'detail': too_many
+ }
+
+ too_much_target_bytes = []
+ for root_id, total in iteritems(total_bytes):
+ total_target = total_target_bytes[root_id]
+ if total_target > 0 and total > root_map[root_id].capacity and root_map[root_id].capacity:
+ too_much_target_bytes.append(
+ 'Pools %s overcommit available storage by %.03fx due to '
+ 'target_size_bytes %s on pools %s' % (
+ root_map[root_id].pool_names,
+ total / root_map[root_id].capacity,
+ mgr_util.format_bytes(total_target, 5, colored=False),
+ target_bytes_pools[root_id]
+ )
+ )
+ elif total_target > root_map[root_id].capacity and root_map[root_id].capacity:
+ too_much_target_bytes.append(
+ 'Pools %s overcommit available storage by %.03fx due to '
+ 'collective target_size_bytes of %s' % (
+ root_map[root_id].pool_names,
+ total / root_map[root_id].capacity,
+ mgr_util.format_bytes(total_target, 5, colored=False),
+ )
+ )
+ if too_much_target_bytes:
+ health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
+ 'severity': 'warning',
+ 'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
+ 'detail': too_much_target_bytes,
+ }
+
+ if bytes_and_ratio:
+ health_checks['POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO'] = {
+ 'severity': 'warning',
+ 'summary': "%d pools have both target_size_bytes and target_size_ratio set" % len(bytes_and_ratio),
+ 'count': len(bytes_and_ratio),
+ 'detail': bytes_and_ratio,
+ }
+
+ self.set_health_checks(health_checks)