summaryrefslogtreecommitdiffstats
path: root/collectors/python.d.plugin/ceph
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 14:31:17 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 14:31:17 +0000
commit8020f71afd34d7696d7933659df2d763ab05542f (patch)
tree2fdf1b5447ffd8bdd61e702ca183e814afdcb4fc /collectors/python.d.plugin/ceph
parentInitial commit. (diff)
downloadnetdata-upstream.tar.xz
netdata-upstream.zip
Adding upstream version 1.37.1.upstream/1.37.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'collectors/python.d.plugin/ceph')
-rw-r--r--collectors/python.d.plugin/ceph/Makefile.inc13
-rw-r--r--collectors/python.d.plugin/ceph/README.md48
-rw-r--r--collectors/python.d.plugin/ceph/ceph.chart.py374
-rw-r--r--collectors/python.d.plugin/ceph/ceph.conf75
4 files changed, 510 insertions, 0 deletions
diff --git a/collectors/python.d.plugin/ceph/Makefile.inc b/collectors/python.d.plugin/ceph/Makefile.inc
new file mode 100644
index 0000000..15b039e
--- /dev/null
+++ b/collectors/python.d.plugin/ceph/Makefile.inc
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_python_DATA += ceph/ceph.chart.py
+dist_pythonconfig_DATA += ceph/ceph.conf
+
+# do not install these files, but include them in the distribution
+dist_noinst_DATA += ceph/README.md ceph/Makefile.inc
+
diff --git a/collectors/python.d.plugin/ceph/README.md b/collectors/python.d.plugin/ceph/README.md
new file mode 100644
index 0000000..b75ba6d
--- /dev/null
+++ b/collectors/python.d.plugin/ceph/README.md
@@ -0,0 +1,48 @@
+<!--
+title: "CEPH monitoring with Netdata"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/ceph/README.md
+sidebar_label: "CEPH"
+-->
+
+# CEPH monitoring with Netdata
+
+Monitors the usage and performance of a Ceph cluster and produces the following charts:
+
+- Cluster statistics (usage, available, latency, objects, read/write rate)
+- OSD usage
+- OSD latency
+- Pool usage
+- Pool read/write operations
+- Pool read/write rate
+- Number of objects per pool
+
+## Requirements
+
+- `rados` Python module
+- Read permission on the keyring file for the `ceph` group (the `netdata` user must be a member of that group)
+
+```shell
+# chmod 640 /etc/ceph/ceph.client.admin.keyring
+```
+
+## Configuration
+
+Edit the `python.d/ceph.conf` configuration file using `edit-config` from the Netdata [config
+directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`.
+
+```bash
+cd /etc/netdata # Replace this path with your Netdata config directory, if different
+sudo ./edit-config python.d/ceph.conf
+```
+
+Sample:
+
+```yaml
+local:
+ config_file: '/etc/ceph/ceph.conf'
+ keyring_file: '/etc/ceph/ceph.client.admin.keyring'
+```
+
+---
+
+
diff --git a/collectors/python.d.plugin/ceph/ceph.chart.py b/collectors/python.d.plugin/ceph/ceph.chart.py
new file mode 100644
index 0000000..494eef4
--- /dev/null
+++ b/collectors/python.d.plugin/ceph/ceph.chart.py
@@ -0,0 +1,374 @@
+# -*- coding: utf-8 -*-
+# Description: ceph netdata python.d module
+# Author: Luis Eduardo (lets00)
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+try:
+ import rados
+
+ CEPH = True
+except ImportError:
+ CEPH = False
+
+import json
+import os
+from copy import deepcopy
+
+from bases.FrameworkServices.SimpleService import SimpleService
+
+# default module values (can be overridden per job in `config`)
+update_every = 10
+
+# Chart ids, assigned to Service.order. Every id listed here has a matching
+# entry in CHARTS below.
+ORDER = [
+ 'general_usage',
+ 'general_objects',
+ 'general_bytes',
+ 'general_operations',
+ 'general_latency',
+ 'pool_usage',
+ 'pool_objects',
+ 'pool_read_bytes',
+ 'pool_write_bytes',
+ 'pool_read_operations',
+ 'pool_write_operations',
+ 'osd_usage',
+ 'osd_size',
+ 'osd_apply_latency',
+ 'osd_commit_latency'
+]
+
+# Chart definitions, assigned to Service.definitions.
+# NOTE(review): the positional layout is presumably the python.d convention
+# 'options': [name, title, units, family, context, chart type] and
+# 'lines': [dimension id, display name, algorithm, multiplier, divisor]
+# - confirm against the framework documentation.
+# The 'general_*' charts have fixed lines whose ids are the keys returned by
+# Service._get_general(). The 'pool_*' and 'osd_*' charts start with empty
+# 'lines'; Service.create_definitions() appends one line per pool / per OSD.
+CHARTS = {
+ 'general_usage': {
+ 'options': [None, 'Ceph General Space', 'KiB', 'general', 'ceph.general_usage', 'stacked'],
+ 'lines': [
+ ['general_available', 'avail', 'absolute'],
+ ['general_usage', 'used', 'absolute']
+ ]
+ },
+ 'general_objects': {
+ 'options': [None, 'Ceph General Objects', 'objects', 'general', 'ceph.general_objects', 'area'],
+ 'lines': [
+ ['general_objects', 'cluster', 'absolute']
+ ]
+ },
+ 'general_bytes': {
+ 'options': [None, 'Ceph General Read/Write Data/s', 'KiB/s', 'general', 'ceph.general_bytes',
+ 'area'],
+ 'lines': [
+ ['general_read_bytes', 'read', 'absolute', 1, 1024],
+ ['general_write_bytes', 'write', 'absolute', -1, 1024]
+ ]
+ },
+ 'general_operations': {
+ 'options': [None, 'Ceph General Read/Write Operations/s', 'operations', 'general', 'ceph.general_operations',
+ 'area'],
+ 'lines': [
+ ['general_read_operations', 'read', 'absolute', 1],
+ ['general_write_operations', 'write', 'absolute', -1]
+ ]
+ },
+ # The values behind these two lines are sums over all OSDs (see
+ # Service._get_general()), not averages.
+ 'general_latency': {
+ 'options': [None, 'Ceph General Apply/Commit latency', 'milliseconds', 'general', 'ceph.general_latency',
+ 'area'],
+ 'lines': [
+ ['general_apply_latency', 'apply', 'absolute'],
+ ['general_commit_latency', 'commit', 'absolute']
+ ]
+ },
+ # Per-pool charts: lines are filled in by Service.create_definitions().
+ 'pool_usage': {
+ 'options': [None, 'Ceph Pools', 'KiB', 'pool', 'ceph.pool_usage', 'line'],
+ 'lines': []
+ },
+ 'pool_objects': {
+ 'options': [None, 'Ceph Pools', 'objects', 'pool', 'ceph.pool_objects', 'line'],
+ 'lines': []
+ },
+ 'pool_read_bytes': {
+ 'options': [None, 'Ceph Read Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_read_bytes', 'area'],
+ 'lines': []
+ },
+ 'pool_write_bytes': {
+ 'options': [None, 'Ceph Write Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_write_bytes', 'area'],
+ 'lines': []
+ },
+ 'pool_read_operations': {
+ 'options': [None, 'Ceph Read Pool Operations/s', 'operations', 'pool', 'ceph.pool_read_operations', 'area'],
+ 'lines': []
+ },
+ 'pool_write_operations': {
+ 'options': [None, 'Ceph Write Pool Operations/s', 'operations', 'pool', 'ceph.pool_write_operations', 'area'],
+ 'lines': []
+ },
+ # Per-OSD charts: lines are filled in by Service.create_definitions().
+ 'osd_usage': {
+ 'options': [None, 'Ceph OSDs', 'KiB', 'osd', 'ceph.osd_usage', 'line'],
+ 'lines': []
+ },
+ 'osd_size': {
+ 'options': [None, 'Ceph OSDs size', 'KiB', 'osd', 'ceph.osd_size', 'line'],
+ 'lines': []
+ },
+ # Note: the fifth option of the two latency charts below ('ceph.apply_latency',
+ # 'ceph.commit_latency') omits the 'osd_' part used in their chart ids.
+ 'osd_apply_latency': {
+ 'options': [None, 'Ceph OSDs apply latency', 'milliseconds', 'osd', 'ceph.apply_latency', 'line'],
+ 'lines': []
+ },
+ 'osd_commit_latency': {
+ 'options': [None, 'Ceph OSDs commit latency', 'milliseconds', 'osd', 'ceph.commit_latency', 'line'],
+ 'lines': []
+ }
+
+}
+
+
+class Service(SimpleService):
+ def __init__(self, configuration=None, name=None):
+ SimpleService.__init__(self, configuration=configuration, name=name)
+ self.order = ORDER
+ self.definitions = CHARTS
+ self.config_file = self.configuration.get('config_file')
+ self.keyring_file = self.configuration.get('keyring_file')
+ self.rados_id = self.configuration.get('rados_id', 'admin')
+
+ def check(self):
+ """
+ Checks module
+ :return:
+ """
+ if not CEPH:
+ self.error('rados module is needed to use ceph.chart.py')
+ return False
+ if not (self.config_file and self.keyring_file):
+ self.error('config_file and/or keyring_file is not defined')
+ return False
+
+ # Verify files and permissions
+ if not (os.access(self.config_file, os.F_OK)):
+ self.error('{0} does not exist'.format(self.config_file))
+ return False
+ if not (os.access(self.keyring_file, os.F_OK)):
+ self.error('{0} does not exist'.format(self.keyring_file))
+ return False
+ if not (os.access(self.config_file, os.R_OK)):
+ self.error('Ceph plugin does not read {0}, define read permission.'.format(self.config_file))
+ return False
+ if not (os.access(self.keyring_file, os.R_OK)):
+ self.error('Ceph plugin does not read {0}, define read permission.'.format(self.keyring_file))
+ return False
+ try:
+ self.cluster = rados.Rados(conffile=self.config_file,
+ conf=dict(keyring=self.keyring_file),
+ rados_id=self.rados_id)
+ self.cluster.connect()
+ except rados.Error as error:
+ self.error(error)
+ return False
+ self.create_definitions()
+ return True
+
+ def create_definitions(self):
+ """
+ Create dynamically charts options
+ :return: None
+ """
+ # Pool lines
+ for pool in sorted(self._get_df()['pools'], key=lambda x: sorted(x.keys())):
+ self.definitions['pool_usage']['lines'].append([pool['name'],
+ pool['name'],
+ 'absolute'])
+ self.definitions['pool_objects']['lines'].append(["obj_{0}".format(pool['name']),
+ pool['name'],
+ 'absolute'])
+ self.definitions['pool_read_bytes']['lines'].append(['read_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute', 1, 1024])
+ self.definitions['pool_write_bytes']['lines'].append(['write_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute', 1, 1024])
+ self.definitions['pool_read_operations']['lines'].append(['read_operations_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute'])
+ self.definitions['pool_write_operations']['lines'].append(['write_operations_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute'])
+
+ # OSD lines
+ for osd in sorted(self._get_osd_df()['nodes'], key=lambda x: sorted(x.keys())):
+ self.definitions['osd_usage']['lines'].append([osd['name'],
+ osd['name'],
+ 'absolute'])
+ self.definitions['osd_size']['lines'].append(['size_{0}'.format(osd['name']),
+ osd['name'],
+ 'absolute'])
+ self.definitions['osd_apply_latency']['lines'].append(['apply_latency_{0}'.format(osd['name']),
+ osd['name'],
+ 'absolute'])
+ self.definitions['osd_commit_latency']['lines'].append(['commit_latency_{0}'.format(osd['name']),
+ osd['name'],
+ 'absolute'])
+
+ def get_data(self):
+ """
+ Catch all ceph data
+ :return: dict
+ """
+ try:
+ data = {}
+ df = self._get_df()
+ osd_df = self._get_osd_df()
+ osd_perf = self._get_osd_perf()
+ osd_perf_infos = get_osd_perf_infos(osd_perf)
+ pool_stats = self._get_osd_pool_stats()
+
+ data.update(self._get_general(osd_perf_infos, pool_stats))
+ for pool in df['pools']:
+ data.update(self._get_pool_usage(pool))
+ data.update(self._get_pool_objects(pool))
+ for pool_io in pool_stats:
+ data.update(self._get_pool_rw(pool_io))
+ for osd in osd_df['nodes']:
+ data.update(self._get_osd_usage(osd))
+ data.update(self._get_osd_size(osd))
+ for osd_apply_commit in osd_perf_infos:
+ data.update(self._get_osd_latency(osd_apply_commit))
+ return data
+ except (ValueError, AttributeError) as error:
+ self.error(error)
+ return None
+
+ def _get_general(self, osd_perf_infos, pool_stats):
+ """
+ Get ceph's general usage
+ :return: dict
+ """
+ status = self.cluster.get_cluster_stats()
+ read_bytes_sec = 0
+ write_bytes_sec = 0
+ read_op_per_sec = 0
+ write_op_per_sec = 0
+ apply_latency = 0
+ commit_latency = 0
+
+ for pool_rw_io_b in pool_stats:
+ read_bytes_sec += pool_rw_io_b['client_io_rate'].get('read_bytes_sec', 0)
+ write_bytes_sec += pool_rw_io_b['client_io_rate'].get('write_bytes_sec', 0)
+ read_op_per_sec += pool_rw_io_b['client_io_rate'].get('read_op_per_sec', 0)
+ write_op_per_sec += pool_rw_io_b['client_io_rate'].get('write_op_per_sec', 0)
+ for perf in osd_perf_infos:
+ apply_latency += perf['perf_stats']['apply_latency_ms']
+ commit_latency += perf['perf_stats']['commit_latency_ms']
+
+ return {
+ 'general_usage': int(status['kb_used']),
+ 'general_available': int(status['kb_avail']),
+ 'general_objects': int(status['num_objects']),
+ 'general_read_bytes': read_bytes_sec,
+ 'general_write_bytes': write_bytes_sec,
+ 'general_read_operations': read_op_per_sec,
+ 'general_write_operations': write_op_per_sec,
+ 'general_apply_latency': apply_latency,
+ 'general_commit_latency': commit_latency
+ }
+
+ @staticmethod
+ def _get_pool_usage(pool):
+ """
+ Process raw data into pool usage dict information
+ :return: A pool dict with pool name's key and usage bytes' value
+ """
+ return {pool['name']: pool['stats']['kb_used']}
+
+ @staticmethod
+ def _get_pool_objects(pool):
+ """
+ Process raw data into pool usage dict information
+ :return: A pool dict with pool name's key and object numbers
+ """
+ return {'obj_{0}'.format(pool['name']): pool['stats']['objects']}
+
+ @staticmethod
+ def _get_pool_rw(pool):
+ """
+ Get read/write kb and operations in a pool
+ :return: A pool dict with both read/write bytes and operations.
+ """
+ return {
+ 'read_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_bytes_sec', 0)),
+ 'write_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_bytes_sec', 0)),
+ 'read_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_op_per_sec', 0)),
+ 'write_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_op_per_sec', 0))
+ }
+
+ @staticmethod
+ def _get_osd_usage(osd):
+ """
+ Process raw data into osd dict information to get osd usage
+ :return: A osd dict with osd name's key and usage bytes' value
+ """
+ return {osd['name']: float(osd['kb_used'])}
+
+ @staticmethod
+ def _get_osd_size(osd):
+ """
+ Process raw data into osd dict information to get osd size (kb)
+ :return: A osd dict with osd name's key and size bytes' value
+ """
+ return {'size_{0}'.format(osd['name']): float(osd['kb'])}
+
+ @staticmethod
+ def _get_osd_latency(osd):
+ """
+ Get ceph osd apply and commit latency
+ :return: A osd dict with osd name's key with both apply and commit latency values
+ """
+ return {
+ 'apply_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['apply_latency_ms'],
+ 'commit_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['commit_latency_ms']
+ }
+
+ def _get_df(self):
+ """
+ Get ceph df output
+ :return: ceph df --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'df',
+ 'format': 'json'
+ }), '')[1].decode('utf-8'))
+
+ def _get_osd_df(self):
+ """
+ Get ceph osd df output
+ :return: ceph osd df --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'osd df',
+ 'format': 'json'
+ }), '')[1].decode('utf-8').replace('-nan', '"-nan"'))
+
+ def _get_osd_perf(self):
+ """
+ Get ceph osd performance
+ :return: ceph osd perf --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'osd perf',
+ 'format': 'json'
+ }), '')[1].decode('utf-8'))
+
+ def _get_osd_pool_stats(self):
+ """
+ Get ceph osd pool status.
+ This command is used to get information about both
+ read/write operation and bytes per second on each pool
+ :return: ceph osd pool stats --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'osd pool stats',
+ 'format': 'json'
+ }), '')[1].decode('utf-8'))
+
+
+def get_osd_perf_infos(osd_perf):
+ # https://github.com/netdata/netdata/issues/8247
+ # module uses 'osd_perf_infos' data, its been moved under 'osdstats` since Ceph v14.2
+ if 'osd_perf_infos' in osd_perf:
+ return osd_perf['osd_perf_infos']
+ return osd_perf['osdstats']['osd_perf_infos']
diff --git a/collectors/python.d.plugin/ceph/ceph.conf b/collectors/python.d.plugin/ceph/ceph.conf
new file mode 100644
index 0000000..81788e8
--- /dev/null
+++ b/collectors/python.d.plugin/ceph/ceph.conf
@@ -0,0 +1,75 @@
+# netdata python.d.plugin configuration for ceph stats
+#
+# This file is in YAML format. Generally the format is:
+#
+# name: value
+#
+# There are 2 sections:
+# - global variables
+# - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 10
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# penalty indicates whether to apply penalty to update_every in case of failures.
+# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes.
+# penalty: yes
+
+# autodetection_retry sets the job re-check interval in seconds.
+# The job is not deleted if check fails.
+# Attempts to start the job are made once every autodetection_retry.
+# This feature is disabled by default.
+# autodetection_retry: 0
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+# name: myname # the JOB's name as it will appear at the
+# # dashboard (by default is the job_name)
+# # JOBs sharing a name are mutually exclusive
+# update_every: 10 # the JOB's data collection frequency
+# priority: 60000 # the JOB's order on the dashboard
+# penalty: yes # the JOB's penalty
+# autodetection_retry: 0 # the JOB's re-check interval in seconds
+#
+# Additionally to the above, ceph plugin also supports the following:
+#
+# config_file: 'config_file' # Ceph config file.
+# keyring_file: 'keyring_file' # Ceph keyring file. The netdata user must be a member of the ceph group
+# # and the keyring file must be group-readable.
+# rados_id: 'rados username' # ID used to connect to the Ceph cluster. Allows using a
+# # dedicated read-only key for pulling data instead of the admin key.
+# ----------------------------------------------------------------------
+# AUTO-DETECTION JOBS
+# only one of them will run (they have the same name)
+#
+config_file: '/etc/ceph/ceph.conf'
+keyring_file: '/etc/ceph/ceph.client.admin.keyring'
+rados_id: 'admin'