Diffstat (limited to 'collectors/python.d.plugin/ceph')
-rw-r--r--  collectors/python.d.plugin/ceph/Makefile.inc  |  13
-rw-r--r--  collectors/python.d.plugin/ceph/README.md     |  48
-rw-r--r--  collectors/python.d.plugin/ceph/ceph.chart.py | 374
-rw-r--r--  collectors/python.d.plugin/ceph/ceph.conf     |  75
4 files changed, 510 insertions, 0 deletions
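The new collector pulls everything through librados' `mon_command()` JSON interface (see `ceph.chart.py` below). As a rough orientation only, not part of the patch, the monitor queries it sends are the JSON equivalents of these CLI commands:

```shell
# CLI equivalents of the mon command prefixes used by ceph.chart.py
# ('df', 'osd df', 'osd perf', 'osd pool stats'):
ceph df --format json
ceph osd df --format json
ceph osd perf --format json
ceph osd pool stats --format json
```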
diff --git a/collectors/python.d.plugin/ceph/Makefile.inc b/collectors/python.d.plugin/ceph/Makefile.inc
new file mode 100644
index 0000000..15b039e
--- /dev/null
+++ b/collectors/python.d.plugin/ceph/Makefile.inc
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_python_DATA += ceph/ceph.chart.py
+dist_pythonconfig_DATA += ceph/ceph.conf
+
+# do not install these files, but include them in the distribution
+dist_noinst_DATA += ceph/README.md ceph/Makefile.inc
+
diff --git a/collectors/python.d.plugin/ceph/README.md b/collectors/python.d.plugin/ceph/README.md
new file mode 100644
index 0000000..b75ba6d
--- /dev/null
+++ b/collectors/python.d.plugin/ceph/README.md
@@ -0,0 +1,48 @@
+<!--
+title: "CEPH monitoring with Netdata"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/ceph/README.md
+sidebar_label: "CEPH"
+-->
+
+# CEPH monitoring with Netdata
+
+Monitors the ceph cluster usage and consumption data of a server, and produces:
+
+- Cluster statistics (usage, available, latency, objects, read/write rate)
+- OSD usage
+- OSD latency
+- Pool usage
+- Pool read/write operations
+- Pool read/write rate
+- number of objects per pool
+
+## Requirements
+
+- `rados` python module
+- Granting read permissions to ceph group from keyring file
+
+```shell
+# chmod 640 /etc/ceph/ceph.client.admin.keyring
+```
+
+## Configuration
+
+Edit the `python.d/ceph.conf` configuration file using `edit-config` from the Netdata [config
+directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`.
+
+```bash
+cd /etc/netdata # Replace this path with your Netdata config directory, if different
+sudo ./edit-config python.d/ceph.conf
+```
+
+Sample:
+
+```yaml
+local:
+  config_file: '/etc/ceph/ceph.conf'
+  keyring_file: '/etc/ceph/ceph.client.admin.keyring'
+```
+
+---
+
+
diff --git a/collectors/python.d.plugin/ceph/ceph.chart.py b/collectors/python.d.plugin/ceph/ceph.chart.py
new file mode 100644
index 0000000..494eef4
--- /dev/null
+++ b/collectors/python.d.plugin/ceph/ceph.chart.py
@@ -0,0 +1,374 @@
+# -*- coding: utf-8 -*-
+# Description: ceph netdata python.d module
+# Author: Luis Eduardo (lets00)
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+try:
+    import rados
+
+    CEPH = True
+except ImportError:
+    CEPH = False
+
+import json
+import os
+
+from bases.FrameworkServices.SimpleService import SimpleService
+
+# default module values (can be overridden per job in `config`)
+update_every = 10
+
+ORDER = [
+    'general_usage',
+    'general_objects',
+    'general_bytes',
+    'general_operations',
+    'general_latency',
+    'pool_usage',
+    'pool_objects',
+    'pool_read_bytes',
+    'pool_write_bytes',
+    'pool_read_operations',
+    'pool_write_operations',
+    'osd_usage',
+    'osd_size',
+    'osd_apply_latency',
+    'osd_commit_latency'
+]
+
+CHARTS = {
+    'general_usage': {
+        'options': [None, 'Ceph General Space', 'KiB', 'general', 'ceph.general_usage', 'stacked'],
+        'lines': [
+            ['general_available', 'avail', 'absolute'],
+            ['general_usage', 'used', 'absolute']
+        ]
+    },
+    'general_objects': {
+        'options': [None, 'Ceph General Objects', 'objects', 'general', 'ceph.general_objects', 'area'],
+        'lines': [
+            ['general_objects', 'cluster', 'absolute']
+        ]
+    },
+    'general_bytes': {
+        'options': [None, 'Ceph General Read/Write Data/s', 'KiB/s', 'general', 'ceph.general_bytes',
+                    'area'],
+        'lines': [
+            ['general_read_bytes', 'read', 'absolute', 1, 1024],
+            ['general_write_bytes', 'write', 'absolute', -1, 1024]
+        ]
+    },
+    'general_operations': {
+        'options': [None, 'Ceph General Read/Write Operations/s', 'operations', 'general', 'ceph.general_operations',
+                    'area'],
+        'lines': [
+            ['general_read_operations', 'read', 'absolute', 1],
+            ['general_write_operations', 'write', 'absolute', -1]
+        ]
+    },
+    'general_latency': {
+        'options': [None, 'Ceph General Apply/Commit latency', 'milliseconds', 'general', 'ceph.general_latency',
+                    'area'],
+        'lines': [
+            ['general_apply_latency', 'apply', 'absolute'],
+            ['general_commit_latency', 'commit', 'absolute']
+        ]
+    },
+    'pool_usage': {
+        'options': [None, 'Ceph Pools', 'KiB', 'pool', 'ceph.pool_usage', 'line'],
+        'lines': []
+    },
+    'pool_objects': {
+        'options': [None, 'Ceph Pools', 'objects', 'pool', 'ceph.pool_objects', 'line'],
+        'lines': []
+    },
+    'pool_read_bytes': {
+        'options': [None, 'Ceph Read Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_read_bytes', 'area'],
+        'lines': []
+    },
+    'pool_write_bytes': {
+        'options': [None, 'Ceph Write Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_write_bytes', 'area'],
+        'lines': []
+    },
+    'pool_read_operations': {
+        'options': [None, 'Ceph Read Pool Operations/s', 'operations', 'pool', 'ceph.pool_read_operations', 'area'],
+        'lines': []
+    },
+    'pool_write_operations': {
+        'options': [None, 'Ceph Write Pool Operations/s', 'operations', 'pool', 'ceph.pool_write_operations', 'area'],
+        'lines': []
+    },
+    'osd_usage': {
+        'options': [None, 'Ceph OSDs', 'KiB', 'osd', 'ceph.osd_usage', 'line'],
+        'lines': []
+    },
+    'osd_size': {
+        'options': [None, 'Ceph OSDs size', 'KiB', 'osd', 'ceph.osd_size', 'line'],
+        'lines': []
+    },
+    'osd_apply_latency': {
+        'options': [None, 'Ceph OSDs apply latency', 'milliseconds', 'osd', 'ceph.apply_latency', 'line'],
+        'lines': []
+    },
+    'osd_commit_latency': {
+        'options': [None, 'Ceph OSDs commit latency', 'milliseconds', 'osd', 'ceph.commit_latency', 'line'],
+        'lines': []
+    }
+
+}
+
+
+class Service(SimpleService):
+    def __init__(self, configuration=None, name=None):
+        SimpleService.__init__(self, configuration=configuration, name=name)
+        self.order = ORDER
+        self.definitions = CHARTS
+        self.config_file = self.configuration.get('config_file')
+        self.keyring_file = self.configuration.get('keyring_file')
+        self.rados_id = self.configuration.get('rados_id', 'admin')
+
+    def check(self):
+        """
+        Checks module
+        :return:
+        """
+        if not CEPH:
+            self.error('rados module is needed to use ceph.chart.py')
+            return False
+        if not (self.config_file and self.keyring_file):
+            self.error('config_file and/or keyring_file is not defined')
+            return False
+
+        # Verify files and permissions
+        if not (os.access(self.config_file, os.F_OK)):
+            self.error('{0} does not exist'.format(self.config_file))
+            return False
+        if not (os.access(self.keyring_file, os.F_OK)):
+            self.error('{0} does not exist'.format(self.keyring_file))
+            return False
+        if not (os.access(self.config_file, os.R_OK)):
+            self.error('Ceph plugin does not read {0}, define read permission.'.format(self.config_file))
+            return False
+        if not (os.access(self.keyring_file, os.R_OK)):
+            self.error('Ceph plugin does not read {0}, define read permission.'.format(self.keyring_file))
+            return False
+        try:
+            self.cluster = rados.Rados(conffile=self.config_file,
+                                       conf=dict(keyring=self.keyring_file),
+                                       rados_id=self.rados_id)
+            self.cluster.connect()
+        except rados.Error as error:
+            self.error(error)
+            return False
+        self.create_definitions()
+        return True
+
+    def create_definitions(self):
+        """
+        Create dynamically charts options
+        :return: None
+        """
+        # Pool lines
+        for pool in sorted(self._get_df()['pools'], key=lambda x: sorted(x.keys())):
+            self.definitions['pool_usage']['lines'].append([pool['name'],
+                                                            pool['name'],
+                                                            'absolute'])
+            self.definitions['pool_objects']['lines'].append(["obj_{0}".format(pool['name']),
+                                                              pool['name'],
+                                                              'absolute'])
+            self.definitions['pool_read_bytes']['lines'].append(['read_{0}'.format(pool['name']),
+                                                                 pool['name'],
+                                                                 'absolute', 1, 1024])
+            self.definitions['pool_write_bytes']['lines'].append(['write_{0}'.format(pool['name']),
+                                                                  pool['name'],
+                                                                  'absolute', 1, 1024])
+            self.definitions['pool_read_operations']['lines'].append(['read_operations_{0}'.format(pool['name']),
+                                                                      pool['name'],
+                                                                      'absolute'])
+            self.definitions['pool_write_operations']['lines'].append(['write_operations_{0}'.format(pool['name']),
+                                                                       pool['name'],
+                                                                       'absolute'])
+
+        # OSD lines
+        for osd in sorted(self._get_osd_df()['nodes'], key=lambda x: sorted(x.keys())):
+            self.definitions['osd_usage']['lines'].append([osd['name'],
+                                                           osd['name'],
+                                                           'absolute'])
+            self.definitions['osd_size']['lines'].append(['size_{0}'.format(osd['name']),
+                                                          osd['name'],
+                                                          'absolute'])
+            self.definitions['osd_apply_latency']['lines'].append(['apply_latency_{0}'.format(osd['name']),
+                                                                   osd['name'],
+                                                                   'absolute'])
+            self.definitions['osd_commit_latency']['lines'].append(['commit_latency_{0}'.format(osd['name']),
+                                                                    osd['name'],
+                                                                    'absolute'])
+
+    def get_data(self):
+        """
+        Catch all ceph data
+        :return: dict
+        """
+        try:
+            data = {}
+            df = self._get_df()
+            osd_df = self._get_osd_df()
+            osd_perf = self._get_osd_perf()
+            osd_perf_infos = get_osd_perf_infos(osd_perf)
+            pool_stats = self._get_osd_pool_stats()
+
+            data.update(self._get_general(osd_perf_infos, pool_stats))
+            for pool in df['pools']:
+                data.update(self._get_pool_usage(pool))
+                data.update(self._get_pool_objects(pool))
+            for pool_io in pool_stats:
+                data.update(self._get_pool_rw(pool_io))
+            for osd in osd_df['nodes']:
+                data.update(self._get_osd_usage(osd))
+                data.update(self._get_osd_size(osd))
+            for osd_apply_commit in osd_perf_infos:
+                data.update(self._get_osd_latency(osd_apply_commit))
+            return data
+        except (ValueError, AttributeError) as error:
+            self.error(error)
+            return None
+
+    def _get_general(self, osd_perf_infos, pool_stats):
+        """
+        Get ceph's general usage
+        :return: dict
+        """
+        status = self.cluster.get_cluster_stats()
+        read_bytes_sec = 0
+        write_bytes_sec = 0
+        read_op_per_sec = 0
+        write_op_per_sec = 0
+        apply_latency = 0
+        commit_latency = 0
+
+        for pool_rw_io_b in pool_stats:
+            read_bytes_sec += pool_rw_io_b['client_io_rate'].get('read_bytes_sec', 0)
+            write_bytes_sec += pool_rw_io_b['client_io_rate'].get('write_bytes_sec', 0)
+            read_op_per_sec += pool_rw_io_b['client_io_rate'].get('read_op_per_sec', 0)
+            write_op_per_sec += pool_rw_io_b['client_io_rate'].get('write_op_per_sec', 0)
+        for perf in osd_perf_infos:
+            apply_latency += perf['perf_stats']['apply_latency_ms']
+            commit_latency += perf['perf_stats']['commit_latency_ms']
+
+        return {
+            'general_usage': int(status['kb_used']),
+            'general_available': int(status['kb_avail']),
+            'general_objects': int(status['num_objects']),
+            'general_read_bytes': read_bytes_sec,
+            'general_write_bytes': write_bytes_sec,
+            'general_read_operations': read_op_per_sec,
+            'general_write_operations': write_op_per_sec,
+            'general_apply_latency': apply_latency,
+            'general_commit_latency': commit_latency
+        }
+
+    @staticmethod
+    def _get_pool_usage(pool):
""" + Process raw data into pool usage dict information + :return: A pool dict with pool name's key and usage bytes' value + """ + return {pool['name']: pool['stats']['kb_used']} + + @staticmethod + def _get_pool_objects(pool): + """ + Process raw data into pool usage dict information + :return: A pool dict with pool name's key and object numbers + """ + return {'obj_{0}'.format(pool['name']): pool['stats']['objects']} + + @staticmethod + def _get_pool_rw(pool): + """ + Get read/write kb and operations in a pool + :return: A pool dict with both read/write bytes and operations. + """ + return { + 'read_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_bytes_sec', 0)), + 'write_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_bytes_sec', 0)), + 'read_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_op_per_sec', 0)), + 'write_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_op_per_sec', 0)) + } + + @staticmethod + def _get_osd_usage(osd): + """ + Process raw data into osd dict information to get osd usage + :return: A osd dict with osd name's key and usage bytes' value + """ + return {osd['name']: float(osd['kb_used'])} + + @staticmethod + def _get_osd_size(osd): + """ + Process raw data into osd dict information to get osd size (kb) + :return: A osd dict with osd name's key and size bytes' value + """ + return {'size_{0}'.format(osd['name']): float(osd['kb'])} + + @staticmethod + def _get_osd_latency(osd): + """ + Get ceph osd apply and commit latency + :return: A osd dict with osd name's key with both apply and commit latency values + """ + return { + 'apply_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['apply_latency_ms'], + 'commit_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['commit_latency_ms'] + } + + def _get_df(self): + """ + Get ceph df output + :return: ceph df --format json + """ + return json.loads(self.cluster.mon_command(json.dumps({ + 'prefix': 'df', + 'format': 'json' + }), '')[1].decode('utf-8')) + + def _get_osd_df(self): + """ + Get ceph osd df output + :return: ceph osd df --format json + """ + return json.loads(self.cluster.mon_command(json.dumps({ + 'prefix': 'osd df', + 'format': 'json' + }), '')[1].decode('utf-8').replace('-nan', '"-nan"')) + + def _get_osd_perf(self): + """ + Get ceph osd performance + :return: ceph osd perf --format json + """ + return json.loads(self.cluster.mon_command(json.dumps({ + 'prefix': 'osd perf', + 'format': 'json' + }), '')[1].decode('utf-8')) + + def _get_osd_pool_stats(self): + """ + Get ceph osd pool status. + This command is used to get information about both + read/write operation and bytes per second on each pool + :return: ceph osd pool stats --format json + """ + return json.loads(self.cluster.mon_command(json.dumps({ + 'prefix': 'osd pool stats', + 'format': 'json' + }), '')[1].decode('utf-8')) + + +def get_osd_perf_infos(osd_perf): + # https://github.com/netdata/netdata/issues/8247 + # module uses 'osd_perf_infos' data, its been moved under 'osdstats` since Ceph v14.2 + if 'osd_perf_infos' in osd_perf: + return osd_perf['osd_perf_infos'] + return osd_perf['osdstats']['osd_perf_infos'] diff --git a/collectors/python.d.plugin/ceph/ceph.conf b/collectors/python.d.plugin/ceph/ceph.conf new file mode 100644 index 0000000..81788e8 --- /dev/null +++ b/collectors/python.d.plugin/ceph/ceph.conf @@ -0,0 +1,75 @@ +# netdata python.d.plugin configuration for ceph stats +# +# This file is in YaML format. 
+#
+# name: value
+#
+# There are 2 sections:
+#  - global variables
+#  - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 10
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# penalty indicates whether to apply penalty to update_every in case of failures.
+# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes.
+# penalty: yes
+
+# autodetection_retry sets the job re-check interval in seconds.
+# The job is not deleted if check fails.
+# Attempts to start the job are made once every autodetection_retry.
+# This feature is disabled by default.
+# autodetection_retry: 0
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+#     name: myname            # the JOB's name as it will appear at the
+#                             # dashboard (by default is the job_name)
+#                             # JOBs sharing a name are mutually exclusive
+#     update_every: 10        # the JOB's data collection frequency
+#     priority: 60000         # the JOB's order on the dashboard
+#     penalty: yes            # the JOB's penalty
+#     autodetection_retry: 0  # the JOB's re-check interval in seconds
+#
+# Additionally to the above, ceph plugin also supports the following:
+#
+# config_file: 'config_file'   # Ceph config file.
+# keyring_file: 'keyring_file' # Ceph keyring file. netdata user must be added into ceph group
+#                              # and keyring file must be read group permission.
+# rados_id: 'rados username'   # ID used to connect to ceph cluster. Allows
+#                              # creating a read only key for pulling data v.s. admin
+# ----------------------------------------------------------------------
+# AUTO-DETECTION JOBS
+# only one of them will run (they have the same name)
+#
+config_file: '/etc/ceph/ceph.conf'
+keyring_file: '/etc/ceph/ceph.client.admin.keyring'
+rados_id: 'admin'
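The job comments above note that `rados_id` allows pulling data with a read-only key instead of `client.admin`. A minimal sketch of that setup, assuming a hypothetical `client.netdata` Ceph user and a `netdata` group on the node (adjust capabilities, group, and paths to your cluster):

```shell
# Create a read-only Ceph client for the collector and let the netdata user read its keyring.
ceph auth get-or-create client.netdata mon 'allow r' -o /etc/ceph/ceph.client.netdata.keyring
chgrp netdata /etc/ceph/ceph.client.netdata.keyring
chmod 640 /etc/ceph/ceph.client.netdata.keyring
```

The job would then point `keyring_file` at `/etc/ceph/ceph.client.netdata.keyring` and set `rados_id: 'netdata'`.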