summary refs log tree commit diff stats
path: root/src/collectors/python.d.plugin/ceph
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-07-24 09:54:23 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-07-24 09:54:44 +0000
commit 836b47cb7e99a977c5a23b059ca1d0b5065d310e (patch)
tree 1604da8f482d02effa033c94a84be42bc0c848c3 /src/collectors/python.d.plugin/ceph
parent Releasing debian version 1.44.3-2. (diff)
download netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.tar.xz
netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.zip
Merging upstream version 1.46.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/collectors/python.d.plugin/ceph')
l--------- src/collectors/python.d.plugin/ceph/README.md 1
-rw-r--r-- src/collectors/python.d.plugin/ceph/ceph.chart.py 374
-rw-r--r-- src/collectors/python.d.plugin/ceph/ceph.conf 75
-rw-r--r-- src/collectors/python.d.plugin/ceph/integrations/ceph.md 194
-rw-r--r-- src/collectors/python.d.plugin/ceph/metadata.yaml 223
5 files changed, 867 insertions, 0 deletions
diff --git a/src/collectors/python.d.plugin/ceph/README.md b/src/collectors/python.d.plugin/ceph/README.md
new file mode 120000
index 00000000..654248b7
--- /dev/null
+++ b/src/collectors/python.d.plugin/ceph/README.md
@@ -0,0 +1 @@
+integrations/ceph.md
\ No newline at end of file
diff --git a/src/collectors/python.d.plugin/ceph/ceph.chart.py b/src/collectors/python.d.plugin/ceph/ceph.chart.py
new file mode 100644
index 00000000..4bcbe197
--- /dev/null
+++ b/src/collectors/python.d.plugin/ceph/ceph.chart.py
@@ -0,0 +1,374 @@
+# -*- coding: utf-8 -*-
+# Description: ceph netdata python.d module
+# Author: Luis Eduardo (lets00)
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+try:
+ import rados
+
+ CEPH = True
+except ImportError:
+ CEPH = False
+
+import json
+import os
+
+from bases.FrameworkServices.SimpleService import SimpleService
+
+# default module values (can be overridden per job in `config`)
+update_every = 10
+
+ORDER = [
+ 'general_usage',
+ 'general_objects',
+ 'general_bytes',
+ 'general_operations',
+ 'general_latency',
+ 'pool_usage',
+ 'pool_objects',
+ 'pool_read_bytes',
+ 'pool_write_bytes',
+ 'pool_read_operations',
+ 'pool_write_operations',
+ 'osd_usage',
+ 'osd_size',
+ 'osd_apply_latency',
+ 'osd_commit_latency'
+]
+
+CHARTS = {
+ 'general_usage': {
+ 'options': [None, 'Ceph General Space', 'KiB', 'general', 'ceph.general_usage', 'stacked'],
+ 'lines': [
+ ['general_available', 'avail', 'absolute'],
+ ['general_usage', 'used', 'absolute']
+ ]
+ },
+ 'general_objects': {
+ 'options': [None, 'Ceph General Objects', 'objects', 'general', 'ceph.general_objects', 'area'],
+ 'lines': [
+ ['general_objects', 'cluster', 'absolute']
+ ]
+ },
+ 'general_bytes': {
+ 'options': [None, 'Ceph General Read/Write Data/s', 'KiB/s', 'general', 'ceph.general_bytes',
+ 'area'],
+ 'lines': [
+ ['general_read_bytes', 'read', 'absolute', 1, 1024],
+ ['general_write_bytes', 'write', 'absolute', -1, 1024]
+ ]
+ },
+ 'general_operations': {
+ 'options': [None, 'Ceph General Read/Write Operations/s', 'operations', 'general', 'ceph.general_operations',
+ 'area'],
+ 'lines': [
+ ['general_read_operations', 'read', 'absolute', 1],
+ ['general_write_operations', 'write', 'absolute', -1]
+ ]
+ },
+ 'general_latency': {
+ 'options': [None, 'Ceph General Apply/Commit latency', 'milliseconds', 'general', 'ceph.general_latency',
+ 'area'],
+ 'lines': [
+ ['general_apply_latency', 'apply', 'absolute'],
+ ['general_commit_latency', 'commit', 'absolute']
+ ]
+ },
+ 'pool_usage': {
+ 'options': [None, 'Ceph Pools', 'KiB', 'pool', 'ceph.pool_usage', 'line'],
+ 'lines': []
+ },
+ 'pool_objects': {
+ 'options': [None, 'Ceph Pools', 'objects', 'pool', 'ceph.pool_objects', 'line'],
+ 'lines': []
+ },
+ 'pool_read_bytes': {
+ 'options': [None, 'Ceph Read Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_read_bytes', 'area'],
+ 'lines': []
+ },
+ 'pool_write_bytes': {
+ 'options': [None, 'Ceph Write Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_write_bytes', 'area'],
+ 'lines': []
+ },
+ 'pool_read_operations': {
+ 'options': [None, 'Ceph Read Pool Operations/s', 'operations', 'pool', 'ceph.pool_read_operations', 'area'],
+ 'lines': []
+ },
+ 'pool_write_operations': {
+ 'options': [None, 'Ceph Write Pool Operations/s', 'operations', 'pool', 'ceph.pool_write_operations', 'area'],
+ 'lines': []
+ },
+ 'osd_usage': {
+ 'options': [None, 'Ceph OSDs', 'KiB', 'osd', 'ceph.osd_usage', 'line'],
+ 'lines': []
+ },
+ 'osd_size': {
+ 'options': [None, 'Ceph OSDs size', 'KiB', 'osd', 'ceph.osd_size', 'line'],
+ 'lines': []
+ },
+ 'osd_apply_latency': {
+ 'options': [None, 'Ceph OSDs apply latency', 'milliseconds', 'osd', 'ceph.apply_latency', 'line'],
+ 'lines': []
+ },
+ 'osd_commit_latency': {
+ 'options': [None, 'Ceph OSDs commit latency', 'milliseconds', 'osd', 'ceph.commit_latency', 'line'],
+ 'lines': []
+ }
+
+}
+
+
+class Service(SimpleService):
+ def __init__(self, configuration=None, name=None):
+ SimpleService.__init__(self, configuration=configuration, name=name)
+ self.order = ORDER
+ self.definitions = CHARTS
+ self.config_file = self.configuration.get('config_file')
+ self.keyring_file = self.configuration.get('keyring_file')
+ self.rados_id = self.configuration.get('rados_id', 'admin')
+
+ def check(self):
+ """
+ Checks module
+ :return:
+ """
+ if not CEPH:
+ self.error('rados module is needed to use ceph.chart.py')
+ return False
+ if not (self.config_file and self.keyring_file):
+ self.error('config_file and/or keyring_file is not defined')
+ return False
+
+ # Verify files and permissions
+ if not (os.access(self.config_file, os.F_OK)):
+ self.error('{0} does not exist'.format(self.config_file))
+ return False
+ if not (os.access(self.keyring_file, os.F_OK)):
+ self.error('{0} does not exist'.format(self.keyring_file))
+ return False
+ if not (os.access(self.config_file, os.R_OK)):
+ self.error('Ceph plugin does not read {0}, define read permission.'.format(self.config_file))
+ return False
+ if not (os.access(self.keyring_file, os.R_OK)):
+ self.error('Ceph plugin does not read {0}, define read permission.'.format(self.keyring_file))
+ return False
+ try:
+ self.cluster = rados.Rados(conffile=self.config_file,
+ conf=dict(keyring=self.keyring_file),
+ rados_id=self.rados_id)
+ self.cluster.connect()
+ except rados.Error as error:
+ self.error(error)
+ return False
+ self.create_definitions()
+ return True
+
+ def create_definitions(self):
+ """
+ Dynamically create the per-pool and per-OSD chart lines
+ :return: None
+ """
+ # Pool lines
+ for pool in sorted(self._get_df()['pools'], key=lambda x: sorted(x.keys())):
+ self.definitions['pool_usage']['lines'].append([pool['name'],
+ pool['name'],
+ 'absolute'])
+ self.definitions['pool_objects']['lines'].append(["obj_{0}".format(pool['name']),
+ pool['name'],
+ 'absolute'])
+ self.definitions['pool_read_bytes']['lines'].append(['read_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute', 1, 1024])
+ self.definitions['pool_write_bytes']['lines'].append(['write_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute', 1, 1024])
+ self.definitions['pool_read_operations']['lines'].append(['read_operations_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute'])
+ self.definitions['pool_write_operations']['lines'].append(['write_operations_{0}'.format(pool['name']),
+ pool['name'],
+ 'absolute'])
+
+ # OSD lines
+ for osd in sorted(self._get_osd_df()['nodes'], key=lambda x: sorted(x.keys())):
+ self.definitions['osd_usage']['lines'].append([osd['name'],
+ osd['name'],
+ 'absolute'])
+ self.definitions['osd_size']['lines'].append(['size_{0}'.format(osd['name']),
+ osd['name'],
+ 'absolute'])
+ self.definitions['osd_apply_latency']['lines'].append(['apply_latency_{0}'.format(osd['name']),
+ osd['name'],
+ 'absolute'])
+ self.definitions['osd_commit_latency']['lines'].append(['commit_latency_{0}'.format(osd['name']),
+ osd['name'],
+ 'absolute'])
+
+ def get_data(self):
+ """
+ Collect all ceph data
+ :return: dict
+ """
+ try:
+ data = {}
+ df = self._get_df()
+ osd_df = self._get_osd_df()
+ osd_perf = self._get_osd_perf()
+ osd_perf_infos = get_osd_perf_infos(osd_perf)
+ pool_stats = self._get_osd_pool_stats()
+
+ data.update(self._get_general(osd_perf_infos, pool_stats))
+ for pool in df['pools']:
+ data.update(self._get_pool_usage(pool))
+ data.update(self._get_pool_objects(pool))
+ for pool_io in pool_stats:
+ data.update(self._get_pool_rw(pool_io))
+ for osd in osd_df['nodes']:
+ data.update(self._get_osd_usage(osd))
+ data.update(self._get_osd_size(osd))
+ for osd_apply_commit in osd_perf_infos:
+ data.update(self._get_osd_latency(osd_apply_commit))
+ return data
+ except (ValueError, AttributeError) as error:
+ self.error(error)
+ return None
+
+ def _get_general(self, osd_perf_infos, pool_stats):
+ """
+ Get ceph's general usage
+ :return: dict
+ """
+ status = self.cluster.get_cluster_stats()
+ read_bytes_sec = 0
+ write_bytes_sec = 0
+ read_op_per_sec = 0
+ write_op_per_sec = 0
+ apply_latency = 0
+ commit_latency = 0
+
+ for pool_rw_io_b in pool_stats:
+ read_bytes_sec += pool_rw_io_b['client_io_rate'].get('read_bytes_sec', 0)
+ write_bytes_sec += pool_rw_io_b['client_io_rate'].get('write_bytes_sec', 0)
+ read_op_per_sec += pool_rw_io_b['client_io_rate'].get('read_op_per_sec', 0)
+ write_op_per_sec += pool_rw_io_b['client_io_rate'].get('write_op_per_sec', 0)
+ for perf in osd_perf_infos:
+ apply_latency += perf['perf_stats']['apply_latency_ms']
+ commit_latency += perf['perf_stats']['commit_latency_ms']
+
+ return {
+ 'general_usage': int(status['kb_used']),
+ 'general_available': int(status['kb_avail']),
+ 'general_objects': int(status['num_objects']),
+ 'general_read_bytes': read_bytes_sec,
+ 'general_write_bytes': write_bytes_sec,
+ 'general_read_operations': read_op_per_sec,
+ 'general_write_operations': write_op_per_sec,
+ 'general_apply_latency': apply_latency,
+ 'general_commit_latency': commit_latency
+ }
+
+ @staticmethod
+ def _get_pool_usage(pool):
+ """
+ Process raw data into pool usage dict information
+ :return: A dict mapping the pool name to its used space in KiB (`kb_used`)
+ """
+ return {pool['name']: pool['stats']['kb_used']}
+
+ @staticmethod
+ def _get_pool_objects(pool):
+ """
+ Process raw data into pool objects dict information
+ :return: A dict mapping 'obj_<pool name>' to the pool's object count
+ """
+ return {'obj_{0}'.format(pool['name']): pool['stats']['objects']}
+
+ @staticmethod
+ def _get_pool_rw(pool):
+ """
+ Get read/write bytes and operations per second in a pool
+ :return: A pool dict with both read/write bytes and operations.
+ """
+ return {
+ 'read_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_bytes_sec', 0)),
+ 'write_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_bytes_sec', 0)),
+ 'read_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_op_per_sec', 0)),
+ 'write_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_op_per_sec', 0))
+ }
+
+ @staticmethod
+ def _get_osd_usage(osd):
+ """
+ Process raw data into osd dict information to get osd usage
+ :return: A dict mapping the osd name to its used space in KiB (`kb_used`)
+ """
+ return {osd['name']: float(osd['kb_used'])}
+
+ @staticmethod
+ def _get_osd_size(osd):
+ """
+ Process raw data into osd dict information to get osd size (kb)
+ :return: A dict mapping 'size_<osd name>' to the osd size in KiB (`kb`)
+ """
+ return {'size_{0}'.format(osd['name']): float(osd['kb'])}
+
+ @staticmethod
+ def _get_osd_latency(osd):
+ """
+ Get ceph osd apply and commit latency
+ :return: A dict with 'apply_latency_osd.<id>' and 'commit_latency_osd.<id>' keys holding the latency values in ms
+ """
+ return {
+ 'apply_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['apply_latency_ms'],
+ 'commit_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['commit_latency_ms']
+ }
+
+ def _get_df(self):
+ """
+ Get ceph df output
+ :return: ceph df --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'df',
+ 'format': 'json'
+ }), b'')[1].decode('utf-8'))
+
+ def _get_osd_df(self):
+ """
+ Get ceph osd df output
+ :return: ceph osd df --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'osd df',
+ 'format': 'json'
+ }), b'')[1].decode('utf-8').replace('-nan', '"-nan"'))
+
+ def _get_osd_perf(self):
+ """
+ Get ceph osd performance
+ :return: ceph osd perf --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'osd perf',
+ 'format': 'json'
+ }), b'')[1].decode('utf-8'))
+
+ def _get_osd_pool_stats(self):
+ """
+ Get ceph osd pool status.
+ This command is used to get information about both
+ read/write operation and bytes per second on each pool
+ :return: ceph osd pool stats --format json
+ """
+ return json.loads(self.cluster.mon_command(json.dumps({
+ 'prefix': 'osd pool stats',
+ 'format': 'json'
+ }), b'')[1].decode('utf-8'))
+
+
+def get_osd_perf_infos(osd_perf):
+ # https://github.com/netdata/netdata/issues/8247
+ # The module uses 'osd_perf_infos' data, which has been moved under 'osdstats' since Ceph v14.2.
+ if 'osd_perf_infos' in osd_perf:
+ return osd_perf['osd_perf_infos']
+ return osd_perf['osdstats']['osd_perf_infos']
diff --git a/src/collectors/python.d.plugin/ceph/ceph.conf b/src/collectors/python.d.plugin/ceph/ceph.conf
new file mode 100644
index 00000000..81788e86
--- /dev/null
+++ b/src/collectors/python.d.plugin/ceph/ceph.conf
@@ -0,0 +1,75 @@
+# netdata python.d.plugin configuration for ceph stats
+#
+# This file is in YAML format. Generally the format is:
+#
+# name: value
+#
+# There are 2 sections:
+# - global variables
+# - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 10
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# penalty indicates whether to apply penalty to update_every in case of failures.
+# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes.
+# penalty: yes
+
+# autodetection_retry sets the job re-check interval in seconds.
+# The job is not deleted if check fails.
+# Attempts to start the job are made once every autodetection_retry.
+# This feature is disabled by default.
+# autodetection_retry: 0
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+# name: myname # the JOB's name as it will appear at the
+# # dashboard (by default is the job_name)
+# # JOBs sharing a name are mutually exclusive
+# update_every: 10 # the JOB's data collection frequency
+# priority: 60000 # the JOB's order on the dashboard
+# penalty: yes # the JOB's penalty
+# autodetection_retry: 0 # the JOB's re-check interval in seconds
+#
+# Additionally to the above, ceph plugin also supports the following:
+#
+# config_file: 'config_file' # Ceph config file.
+# keyring_file: 'keyring_file' # Ceph keyring file. The netdata user must be added to the ceph group
+# # and the keyring file must be readable by that group.
+# rados_id: 'rados username' # ID used to connect to the ceph cluster. Allows
+# # using a read-only key for pulling data instead of the admin key.
+# ----------------------------------------------------------------------
+# AUTO-DETECTION JOBS
+# only one of them will run (they have the same name)
+#
+config_file: '/etc/ceph/ceph.conf'
+keyring_file: '/etc/ceph/ceph.client.admin.keyring'
+rados_id: 'admin'
diff --git a/src/collectors/python.d.plugin/ceph/integrations/ceph.md b/src/collectors/python.d.plugin/ceph/integrations/ceph.md
new file mode 100644
index 00000000..2b49a331
--- /dev/null
+++ b/src/collectors/python.d.plugin/ceph/integrations/ceph.md
@@ -0,0 +1,194 @@
+<!--startmeta
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/python.d.plugin/ceph/README.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/python.d.plugin/ceph/metadata.yaml"
+sidebar_label: "Ceph"
+learn_status: "Published"
+learn_rel_path: "Collecting Metrics/Storage, Mount Points and Filesystems"
+most_popular: False
+message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
+endmeta-->
+
+# Ceph
+
+
+<img src="https://netdata.cloud/img/ceph.svg" width="150"/>
+
+
+Plugin: python.d.plugin
+Module: ceph
+
+<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" />
+
+## Overview
+
+This collector monitors Ceph metrics about Cluster statistics, OSD usage, latency and Pool statistics.
+
+Uses the `rados` python module to connect to a Ceph cluster.
+
+This collector is supported on all platforms.
+
+This collector supports collecting metrics from multiple instances of this integration, including remote instances.
+
+
+### Default Behavior
+
+#### Auto-Detection
+
+This integration doesn't support auto-detection.
+
+#### Limits
+
+The default configuration for this integration does not impose any limits on data collection.
+
+#### Performance Impact
+
+The default configuration for this integration is not expected to impose a significant performance impact on the system.
+
+
+## Metrics
+
+Metrics grouped by *scope*.
+
+The scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.
+
+
+
+### Per Ceph instance
+
+These metrics refer to the entire monitored application.
+
+This scope has no labels.
+
+Metrics:
+
+| Metric | Dimensions | Unit |
+|:------|:----------|:----|
+| ceph.general_usage | avail, used | KiB |
+| ceph.general_objects | cluster | objects |
+| ceph.general_bytes | read, write | KiB/s |
+| ceph.general_operations | read, write | operations |
+| ceph.general_latency | apply, commit | milliseconds |
+| ceph.pool_usage | a dimension per Ceph Pool | KiB |
+| ceph.pool_objects | a dimension per Ceph Pool | objects |
+| ceph.pool_read_bytes | a dimension per Ceph Pool | KiB/s |
+| ceph.pool_write_bytes | a dimension per Ceph Pool | KiB/s |
+| ceph.pool_read_operations | a dimension per Ceph Pool | operations |
+| ceph.pool_write_operations | a dimension per Ceph Pool | operations |
+| ceph.osd_usage | a dimension per Ceph OSD | KiB |
+| ceph.osd_size | a dimension per Ceph OSD | KiB |
+| ceph.apply_latency | a dimension per Ceph OSD | milliseconds |
+| ceph.commit_latency | a dimension per Ceph OSD | milliseconds |
+
+
+
+## Alerts
+
+
+The following alerts are available:
+
+| Alert name | On metric | Description |
+|:------------|:----------|:------------|
+| [ ceph_cluster_space_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ceph.conf) | ceph.general_usage | cluster disk space utilization |
+
+
+## Setup
+
+### Prerequisites
+
+#### `rados` python module
+
+Make sure the `rados` python module is installed
+
+#### Granting the ceph group read permission on the keyring file
+
+Execute: `chmod 640 /etc/ceph/ceph.client.admin.keyring`
+
+#### Create a specific rados_id
+
+You can optionally create a rados_id to use instead of admin
+
+
+### Configuration
+
+#### File
+
+The configuration file name for this integration is `python.d/ceph.conf`.
+
+
+You can edit the configuration file using the `edit-config` script from the
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
+
+```bash
+cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
+sudo ./edit-config python.d/ceph.conf
+```
+#### Options
+
+There are 2 sections:
+
+* Global variables
+* One or more JOBS that can define multiple different instances to monitor.
+
+The following options can be defined globally: priority, penalty, autodetection_retry, update_every, but can also be defined per JOB to override the global values.
+
+Additionally, the following collapsed table contains all the options that can be configured inside a JOB definition.
+
+Every configuration JOB starts with a `job_name` value which will appear in the dashboard, unless a `name` parameter is specified.
+
+
+<details open><summary>Config options</summary>
+
+| Name | Description | Default | Required |
+|:----|:-----------|:-------|:--------:|
+| update_every | Sets the default data collection frequency. | 10 | no |
+| priority | Controls the order of charts at the netdata dashboard. | 60000 | no |
+| autodetection_retry | Sets the job re-check interval in seconds. | 0 | no |
+| penalty | Indicates whether to apply penalty to update_every in case of failures. | yes | no |
+| name | Job name. This value will overwrite the `job_name` value. JOBS with the same name are mutually exclusive. Only one of them will be allowed running at any time. This allows autodetection to try several alternatives and pick the one that works. | | no |
+| config_file | Ceph config file | | yes |
+| keyring_file | Ceph keyring file. The netdata user must be added to the ceph group and the keyring file must be readable by that group. | | yes |
+| rados_id | A rados user id to use for connecting to the Ceph cluster. | admin | no |
+
+</details>
+
+#### Examples
+
+##### Basic local Ceph cluster
+
+A basic configuration to connect to a local Ceph cluster.
+
+```yaml
+local:
+ config_file: '/etc/ceph/ceph.conf'
+ keyring_file: '/etc/ceph/ceph.client.admin.keyring'
+
+```
+
+
+## Troubleshooting
+
+### Debug Mode
+
+To troubleshoot issues with the `ceph` collector, run the `python.d.plugin` with the debug option enabled. The output
+should give you clues as to why the collector isn't working.
+
+- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on
+ your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.
+
+ ```bash
+ cd /usr/libexec/netdata/plugins.d/
+ ```
+
+- Switch to the `netdata` user.
+
+ ```bash
+ sudo -u netdata -s
+ ```
+
+- Run the `python.d.plugin` to debug the collector:
+
+ ```bash
+ ./python.d.plugin ceph debug trace
+ ```
+
+
diff --git a/src/collectors/python.d.plugin/ceph/metadata.yaml b/src/collectors/python.d.plugin/ceph/metadata.yaml
new file mode 100644
index 00000000..64294113
--- /dev/null
+++ b/src/collectors/python.d.plugin/ceph/metadata.yaml
@@ -0,0 +1,223 @@
+plugin_name: python.d.plugin
+modules:
+ - meta:
+ plugin_name: python.d.plugin
+ module_name: ceph
+ monitored_instance:
+ name: Ceph
+ link: 'https://ceph.io/'
+ categories:
+ - data-collection.storage-mount-points-and-filesystems
+ icon_filename: 'ceph.svg'
+ related_resources:
+ integrations:
+ list: []
+ info_provided_to_referring_integrations:
+ description: ''
+ keywords:
+ - ceph
+ - storage
+ most_popular: false
+ overview:
+ data_collection:
+ metrics_description: 'This collector monitors Ceph metrics about Cluster statistics, OSD usage, latency and Pool statistics.'
+ method_description: 'Uses the `rados` python module to connect to a Ceph cluster.'
+ supported_platforms:
+ include: []
+ exclude: []
+ multi_instance: true
+ additional_permissions:
+ description: ''
+ default_behavior:
+ auto_detection:
+ description: ''
+ limits:
+ description: ''
+ performance_impact:
+ description: ''
+ setup:
+ prerequisites:
+ list:
+ - title: '`rados` python module'
+ description: 'Make sure the `rados` python module is installed'
+ - title: 'Granting the ceph group read permission on the keyring file'
+ description: 'Execute: `chmod 640 /etc/ceph/ceph.client.admin.keyring`'
+ - title: 'Create a specific rados_id'
+ description: 'You can optionally create a rados_id to use instead of admin'
+ configuration:
+ file:
+ name: python.d/ceph.conf
+ options:
+ description: |
+ There are 2 sections:
+
+ * Global variables
+ * One or more JOBS that can define multiple different instances to monitor.
+
+ The following options can be defined globally: priority, penalty, autodetection_retry, update_every, but can also be defined per JOB to override the global values.
+
+ Additionally, the following collapsed table contains all the options that can be configured inside a JOB definition.
+
+ Every configuration JOB starts with a `job_name` value which will appear in the dashboard, unless a `name` parameter is specified.
+ folding:
+ title: "Config options"
+ enabled: true
+ list:
+ - name: update_every
+ description: Sets the default data collection frequency.
+ default_value: 10
+ required: false
+ - name: priority
+ description: Controls the order of charts at the netdata dashboard.
+ default_value: 60000
+ required: false
+ - name: autodetection_retry
+ description: Sets the job re-check interval in seconds.
+ default_value: 0
+ required: false
+ - name: penalty
+ description: Indicates whether to apply penalty to update_every in case of failures.
+ default_value: yes
+ required: false
+ - name: name
+ description: Job name. This value will overwrite the `job_name` value. JOBS with the same name are mutually exclusive. Only one of them will be allowed running at any time. This allows autodetection to try several alternatives and pick the one that works.
+ default_value: ''
+ required: false
+ - name: config_file
+ description: Ceph config file
+ default_value: ''
+ required: true
+ - name: keyring_file
+ description: Ceph keyring file. The netdata user must be added to the ceph group and the keyring file must be readable by that group.
+ default_value: ''
+ required: true
+ - name: rados_id
+ description: A rados user id to use for connecting to the Ceph cluster.
+ default_value: 'admin'
+ required: false
+ examples:
+ folding:
+ enabled: true
+ title: "Config"
+ list:
+ - name: Basic local Ceph cluster
+ description: A basic configuration to connect to a local Ceph cluster.
+ folding:
+ enabled: false
+ config: |
+ local:
+ config_file: '/etc/ceph/ceph.conf'
+ keyring_file: '/etc/ceph/ceph.client.admin.keyring'
+ troubleshooting:
+ problems:
+ list: []
+ alerts:
+ - name: ceph_cluster_space_usage
+ link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ceph.conf
+ metric: ceph.general_usage
+ info: cluster disk space utilization
+ metrics:
+ folding:
+ title: Metrics
+ enabled: false
+ description: ""
+ availability: []
+ scopes:
+ - name: global
+ description: "These metrics refer to the entire monitored application."
+ labels: []
+ metrics:
+ - name: ceph.general_usage
+ description: Ceph General Space
+ unit: "KiB"
+ chart_type: stacked
+ dimensions:
+ - name: avail
+ - name: used
+ - name: ceph.general_objects
+ description: Ceph General Objects
+ unit: "objects"
+ chart_type: area
+ dimensions:
+ - name: cluster
+ - name: ceph.general_bytes
+ description: Ceph General Read/Write Data/s
+ unit: "KiB/s"
+ chart_type: area
+ dimensions:
+ - name: read
+ - name: write
+ - name: ceph.general_operations
+ description: Ceph General Read/Write Operations/s
+ unit: "operations"
+ chart_type: area
+ dimensions:
+ - name: read
+ - name: write
+ - name: ceph.general_latency
+ description: Ceph General Apply/Commit latency
+ unit: "milliseconds"
+ chart_type: area
+ dimensions:
+ - name: apply
+ - name: commit
+ - name: ceph.pool_usage
+ description: Ceph Pools
+ unit: "KiB"
+ chart_type: line
+ dimensions:
+ - name: a dimension per Ceph Pool
+ - name: ceph.pool_objects
+ description: Ceph Pools
+ unit: "objects"
+ chart_type: line
+ dimensions:
+ - name: a dimension per Ceph Pool
+ - name: ceph.pool_read_bytes
+ description: Ceph Read Pool Data/s
+ unit: "KiB/s"
+ chart_type: area
+ dimensions:
+ - name: a dimension per Ceph Pool
+ - name: ceph.pool_write_bytes
+ description: Ceph Write Pool Data/s
+ unit: "KiB/s"
+ chart_type: area
+ dimensions:
+ - name: a dimension per Ceph Pool
+ - name: ceph.pool_read_operations
+ description: Ceph Read Pool Operations/s
+ unit: "operations"
+ chart_type: area
+ dimensions:
+ - name: a dimension per Ceph Pool
+ - name: ceph.pool_write_operations
+ description: Ceph Write Pool Operations/s
+ unit: "operations"
+ chart_type: area
+ dimensions:
+ - name: a dimension per Ceph Pool
+ - name: ceph.osd_usage
+ description: Ceph OSDs
+ unit: "KiB"
+ chart_type: line
+ dimensions:
+ - name: a dimension per Ceph OSD
+ - name: ceph.osd_size
+ description: Ceph OSDs size
+ unit: "KiB"
+ chart_type: line
+ dimensions:
+ - name: a dimension per Ceph OSD
+ - name: ceph.apply_latency
+ description: Ceph OSDs apply latency
+ unit: "milliseconds"
+ chart_type: line
+ dimensions:
+ - name: a dimension per Ceph OSD
+ - name: ceph.commit_latency
+ description: Ceph OSDs commit latency
+ unit: "milliseconds"
+ chart_type: line
+ dimensions:
+ - name: a dimension per Ceph OSD