summaryrefslogtreecommitdiffstats
path: root/monitoring/ceph-mixin/dashboards
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--monitoring/ceph-mixin/dashboards.jsonnet6
-rw-r--r--monitoring/ceph-mixin/dashboards.libsonnet10
-rw-r--r--monitoring/ceph-mixin/dashboards/cephfs.libsonnet89
-rw-r--r--monitoring/ceph-mixin/dashboards/host.libsonnet723
-rw-r--r--monitoring/ceph-mixin/dashboards/osd.libsonnet593
-rw-r--r--monitoring/ceph-mixin/dashboards/pool.libsonnet552
-rw-r--r--monitoring/ceph-mixin/dashboards/rbd.libsonnet337
-rw-r--r--monitoring/ceph-mixin/dashboards/rgw.libsonnet872
-rw-r--r--monitoring/ceph-mixin/dashboards/utils.libsonnet333
-rw-r--r--monitoring/ceph-mixin/dashboards_out/.lint5
-rw-r--r--monitoring/ceph-mixin/dashboards_out/ceph-cluster.json1244
-rw-r--r--monitoring/ceph-mixin/dashboards_out/cephfs-overview.json362
-rw-r--r--monitoring/ceph-mixin/dashboards_out/host-details.json1243
-rw-r--r--monitoring/ceph-mixin/dashboards_out/hosts-overview.json894
-rw-r--r--monitoring/ceph-mixin/dashboards_out/osd-device-details.json871
-rw-r--r--monitoring/ceph-mixin/dashboards_out/osds-overview.json963
-rw-r--r--monitoring/ceph-mixin/dashboards_out/pool-detail.json708
-rw-r--r--monitoring/ceph-mixin/dashboards_out/pool-overview.json1542
-rw-r--r--monitoring/ceph-mixin/dashboards_out/radosgw-detail.json542
-rw-r--r--monitoring/ceph-mixin/dashboards_out/radosgw-overview.json1266
-rw-r--r--monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json504
-rw-r--r--monitoring/ceph-mixin/dashboards_out/rbd-details.json458
-rw-r--r--monitoring/ceph-mixin/dashboards_out/rbd-overview.json737
23 files changed, 14854 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/dashboards.jsonnet b/monitoring/ceph-mixin/dashboards.jsonnet
new file mode 100644
index 000000000..9d913ed3f
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards.jsonnet
@@ -0,0 +1,6 @@
+local dashboards = (import 'mixin.libsonnet').grafanaDashboards;  // the mixin's dashboards object
+
+{  // output object: one top-level field per dashboard, keyed by the name it has in grafanaDashboards
+ [name]: dashboards[name]
+ for name in std.objectFields(dashboards)  // std.objectFields returns only visible (non-hidden) field names
+}
diff --git a/monitoring/ceph-mixin/dashboards.libsonnet b/monitoring/ceph-mixin/dashboards.libsonnet
new file mode 100644
index 000000000..5cae18329
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards.libsonnet
@@ -0,0 +1,10 @@
+{
+ grafanaDashboards+::  // hidden field (`::`); the `+` merges into any grafanaDashboards inherited from the object this is mixed into
+ (import 'dashboards/cephfs.libsonnet') +  // each import is an object whose fields are merged left to right
+ (import 'dashboards/host.libsonnet') +
+ (import 'dashboards/osd.libsonnet') +
+ (import 'dashboards/pool.libsonnet') +
+ (import 'dashboards/rbd.libsonnet') +
+ (import 'dashboards/rgw.libsonnet') +
+ { _config:: $._config },  // pass the root object's _config to the merged dashboards as a hidden field
+}
diff --git a/monitoring/ceph-mixin/dashboards/cephfs.libsonnet b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet
new file mode 100644
index 000000000..d12d9f4dd
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet
@@ -0,0 +1,89 @@
+local g = import 'grafonnet/grafana.libsonnet';
+
+(import 'utils.libsonnet') {
+ 'cephfs-overview.json':
+ $.dashboardSchema(
+ 'MDS Performance',
+ '',
+ 'tbO9LAiZz',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags,
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('mds_servers',
+ '$datasource',
+ 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ 'MDS Server',
+ '')
+ )
+ .addPanels([
+ $.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ $.simpleGraphPanel(  // MDS Workload: objecter read vs. write op rate, summed over the selected MDS daemons
+ {},
+ 'MDS Workload - $mds_servers',
+ '',
+ 'none',
+ 'Reads(-) / Writes (+)',  // axis label: the read series is meant to be drawn below zero
+ 0,
+ 'sum(rate(ceph_objecter_op_r{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % $.matchers(),
+ 'Read Ops',  // legend of the first target; the series override below keys on this text
+ 0,  // the last four numbers are presumably grid x, y, w, h -- confirm against utils.libsonnet
+ 1,
+ 12,
+ 9
+ )
+ .addTarget($.addTargetSchema(
+ 'sum(rate(ceph_objecter_op_w{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % $.matchers(),
+ 'Write Ops'
+ ))
+ .addSeriesOverride(  // alias must match the legend text; the previous '/.*Reads/' never matched 'Read Ops', so reads were not flipped
+ { alias: '/.*Read Ops/', transform: 'negative-Y' }
+ ),
+ $.simpleGraphPanel(
+ {},
+ 'Client Request Load - $mds_servers',
+ '',
+ 'none',
+ 'Client Requests',
+ 0,
+ 'ceph_mds_server_handle_client_request{%(matchers)s, ceph_daemon=~"($mds_servers).*"}' % $.matchers(),
+ '{{ceph_daemon}}',
+ 12,
+ 1,
+ 12,
+ 9
+ ),
+ ]),
+}
diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet
new file mode 100644
index 000000000..3e0b31f2c
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/host.libsonnet
@@ -0,0 +1,723 @@
+local g = import 'grafonnet/grafana.libsonnet';
+
+(import 'utils.libsonnet') {
+ 'hosts-overview.json':
+ $.dashboardSchema(
+ 'Host Overview',
+ '',
+ 'y0KGL0iZz',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags,
+ '',
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='singlestat', name='Singlestat', version='5.0.0'
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('osd_hosts',
+ '$datasource',
+ 'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ '([^.]*).*')
+ )
+ .addTemplate(
+ $.addTemplateSchema('mon_hosts',
+ '$datasource',
+ 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ 'mon.(.*)')
+ )
+ .addTemplate(
+ $.addTemplateSchema('mds_hosts',
+ '$datasource',
+ 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ 'mds.(.*)')
+ )
+ .addTemplate(
+ $.addTemplateSchema('rgw_hosts',
+ '$datasource',
+ 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ 'rgw.(.*)')
+ )
+ .addPanels([
+ $.simpleSingleStatPanel(
+ 'none',
+ 'OSD Hosts',
+ '',
+ 'current',
+ 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(),
+ true,
+ 'time_series',
+ 0,
+ 0,
+ 4,
+ 5
+ ),
+ $.simpleSingleStatPanel(
+ 'percentunit',
+ 'AVG CPU Busy',
+ 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster',
+ 'current',
+ |||
+ avg(1 - (
+ avg by(instance) (
+ rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
+ rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
+ )
+ ))
+ |||,
+ true,
+ 'time_series',
+ 4,
+ 0,
+ 4,
+ 5
+ ),
+ $.simpleSingleStatPanel(
+ 'percentunit',
+ 'AVG RAM Utilization',
+ 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)',
+ 'current',
+ |||
+ avg ((
+ (
+ node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ ) - ((
+ node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +
+ (
+ node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ ) + (
+ node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ ) + (
+ node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ )
+ )
+ ) / (
+ node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"}
+ ))
+ |||,
+ true,
+ 'time_series',
+ 8,
+ 0,
+ 4,
+ 5
+ ),
+ $.simpleSingleStatPanel(
+ 'none',
+ 'Physical IOPS',
+ 'IOPS Load at the device as reported by the OS on all OSD hosts',
+ 'current',
+ |||
+ sum ((
+ rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
+ rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
+ ) + (
+ rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
+ rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
+ ))
+ |||,
+ true,
+ 'time_series',
+ 12,
+ 0,
+ 4,
+ 5
+ ),
+ $.simpleSingleStatPanel(
+ 'percent',
+ 'AVG Disk Utilization',
+ 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)',
+ 'current',
+ |||
+ avg (
+ label_replace(
+ (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or
+ (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),
+ "instance", "$1", "instance", "([^.:]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^.:]*).*"
+ )
+ )
+ ||| % $.matchers(),
+ true,
+ 'time_series',
+ 16,
+ 0,
+ 4,
+ 5
+ ),
+ $.simpleSingleStatPanel(  // Network Load: total receive + transmit byte rate over the OSD/MON/MDS/RGW host variables
+ 'bytes',
+ 'Network Load',
+ 'Total send/receive network load across all hosts in the ceph cluster',
+ 'current',
+ |||
+ sum (
+ (
+ rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) unless on (device, instance)
+ label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
+ ) +
+ sum (
+ (
+ rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) unless on (device, instance)
+ label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
+ )
+ |||,  // every host variable needs its `$` sigil; without it Grafana matches the literal text "mon_hosts" etc.
+ true,  // NOTE(review): meaning of this flag is defined in utils.libsonnet (not shown) -- confirm
+ 'time_series',
+ 20,  // the last four numbers are presumably grid x, y, w, h -- confirm against utils.libsonnet
+ 0,
+ 4,
+ 5
+ ),
+ $.simpleGraphPanel(
+ {},
+ 'CPU Busy - Top 10 Hosts',
+ 'Show the top 10 busiest hosts by cpu',
+ 'percent',
+ null,
+ 0,
+ |||
+ topk(10,
+ 100 * (
+ 1 - (
+ avg by(instance) (
+ rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
+ rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
+ )
+ )
+ )
+ )
+ |||,
+ '{{instance}}',
+ 0,
+ 5,
+ 12,
+ 9
+ ),
+ $.simpleGraphPanel(
+ {},
+ 'Network Load - Top 10 Hosts',
+ 'Top 10 hosts by network load',
+ 'Bps',
+ null,
+ 0,
+ |||
+ topk(10, (sum by(instance) (
+ (
+ rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) +
+ (
+ rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) unless on (device, instance)
+ label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))
+ ))
+ |||,
+ '{{instance}}',
+ 12,
+ 5,
+ 12,
+ 9
+ ),
+ ]),
+ 'host-details.json':
+ $.dashboardSchema(
+ 'Host Details',
+ '',
+ 'rtOg0AiWz',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ ''
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='singlestat', name='Singlestat', version='5.0.0'
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard'
+ )
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('ceph_hosts',
+ '$datasource',
+ 'label_values({%(clusterMatcher)s}, instance)' % $.matchers(),
+ 1,
+ false,
+ 3,
+ 'Hostname',
+ '([^.:]*).*')
+ )
+ .addPanels([
+ $.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ $.simpleSingleStatPanel(
+ 'none',
+ 'OSDs',
+ '',
+ 'current',
+ "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % $.matchers(),
+ null,
+ 'time_series',
+ 0,
+ 1,
+ 3,
+ 5
+ ),
+ $.simpleGraphPanel(
+ {
+ interrupt: '#447EBC',
+ steal: '#6D1F62',
+ system: '#890F02',
+ user: '#3F6833',
+ wait: '#C15C17',
+ },
+ 'CPU Utilization',
+ "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown",
+ 'percent',
+ '% Utilization',
+ null,
+ |||
+ sum by (mode) (
+ rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or
+ rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval])
+ ) / (
+ scalar(
+ sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]))
+ ) * 100
+ )
+ |||,
+ '{{mode}}',
+ 3,
+ 1,
+ 6,
+ 10
+ ),
+ $.simpleGraphPanel(
+ {
+ Available: '#508642',
+ Free: '#508642',
+ Total: '#bf1b00',
+ Used: '#bf1b00',
+ total: '#bf1b00',
+ used: '#0a50a1',
+ },
+ 'RAM Usage',
+ '',
+ 'bytes',
+ 'RAM used',
+ null,
+ |||
+ node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ |||,
+ 'Free',
+ 9,
+ 1,
+ 6,
+ 10
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ |||,
+ 'total'
+ ),
+ $.addTargetSchema(
+ |||
+ (
+ node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) + (
+ node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) + (
+ node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ )
+ |||,
+ 'buffers/cache'
+ ),
+ $.addTargetSchema(
+ |||
+ (
+ node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) - (
+ (
+ node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) + (
+ node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) + (
+ node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) +
+ (
+ node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ )
+ )
+ |||,
+ 'used'
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ {
+ alias: 'total',
+ color: '#bf1b00',
+ fill: 0,
+ linewidth: 2,
+ stack: false,
+ }
+ ),
+ $.simpleGraphPanel(
+ {},
+ 'Network Load',
+ "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')",
+ 'decbytes',
+ 'Send (-) / Receive (+)',
+ null,
+ |||
+ sum by (device) (
+ rate(
+ node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
+ rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]
+ )
+ )
+ |||,
+ '{{device}}.rx',
+ 15,
+ 1,
+ 6,
+ 10
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum by (device) (
+ rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
+ rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval])
+ )
+ |||,
+ '{{device}}.tx'
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ { alias: '/.*tx/', transform: 'negative-Y' }
+ ),
+ $.simpleGraphPanel(
+ {},
+ 'Network drop rate',
+ '',
+ 'pps',
+ 'Send (-) / Receive (+)',
+ null,
+ |||
+ rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ |||,
+ '{{device}}.rx',
+ 21,
+ 1,
+ 3,
+ 5
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ |||,
+ '{{device}}.tx'
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ {
+ alias: '/.*tx/',
+ transform: 'negative-Y',
+ }
+ ),
+ $.simpleSingleStatPanel(
+ 'bytes',
+ 'Raw Capacity',
+ 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.',
+ 'current',
+ |||
+ sum(
+ ceph_osd_stat_bytes{%(matchers)s} and
+ on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}
+ )
+ ||| % $.matchers(),
+ null,
+ 'time_series',
+ 0,
+ 6,
+ 3,
+ 5
+ ),
+ $.simpleGraphPanel(
+ {},
+ 'Network error rate',
+ '',
+ 'pps',
+ 'Send (-) / Receive (+)',
+ null,
+ |||
+ rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ |||,
+ '{{device}}.rx',
+ 21,
+ 6,
+ 3,
+ 5
+ )
+ .addTargets(
+ [$.addTargetSchema(
+ |||
+ rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ |||,
+ '{{device}}.tx'
+ )]
+ )
+ .addSeriesOverride(
+ {
+ alias: '/.*tx/',
+ transform: 'negative-Y',
+ }
+ ),
+ $.addRowSchema(false,
+ true,
+ 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } },
+ $.simpleGraphPanel(
+ {},
+ '$ceph_hosts Disk IOPS',
+ "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value",
+ 'ops',
+ 'Read (-) / Write (+)',
+ null,
+ |||
+ label_replace(
+ (
+ rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) writes',
+ 0,
+ 12,
+ 11,
+ 9
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ label_replace(
+ (
+ rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) reads'
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ { alias: '/.*reads/', transform: 'negative-Y' }
+ ),
+ $.simpleGraphPanel(
+ {},
+ '$ceph_hosts Throughput by Disk',
+ 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id',
+ 'Bps',
+ 'Read (-) / Write (+)',
+ null,
+ |||
+ label_replace(
+ (
+ rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
+ group_left(ceph_daemon) label_replace(
+ label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
+ "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) write',
+ 12,
+ 12,
+ 11,
+ 9
+ )
+ .addTargets(
+ [$.addTargetSchema(
+ |||
+ label_replace(
+ (
+ rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ),
+ "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
+ group_left(ceph_daemon) label_replace(
+ label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
+ "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) read'
+ )]
+ )
+ .addSeriesOverride(
+ { alias: '/.*read/', transform: 'negative-Y' }
+ ),
+ $.simpleGraphPanel(  // Disk Latency: per-device I/O time per completed op, joined to the owning ceph_daemon
+ {},
+ '$ceph_hosts Disk Latency',
+ "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id",
+ 's',
+ '',
+ null,
+ |||
+ max by(instance, device) (label_replace(
+ (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
+ clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or
+ (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
+ clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001),
+ "instance", "$1", "instance", "([^:.]*).*"
+ )) * on(instance, device) group_left(ceph_daemon) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),  // the ceph_disk_occupation_human selector now carries %(matchers)s, so this formatting is no longer a no-op
+ '{{device}}({{ceph_daemon}})',
+ 0,  // the last four numbers are presumably grid x, y, w, h -- confirm against utils.libsonnet
+ 21,
+ 11,
+ 9
+ ),
+ $.simpleGraphPanel(
+ {},
+ '$ceph_hosts Disk utilization',
+ 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.',
+ 'percent',
+ '%Util',
+ null,
+ |||
+ label_replace(
+ (
+ (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or
+ rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
+ label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"},
+ "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}})',
+ 12,
+ 21,
+ 11,
+ 9
+ ),
+ ]),
+}
diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet
new file mode 100644
index 000000000..129b74ba6
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet
@@ -0,0 +1,593 @@
+local g = import 'grafonnet/grafana.libsonnet';
+
+(import 'utils.libsonnet') {
+ 'osds-overview.json':
+ $.dashboardSchema(
+ 'OSD Overview',
+ '',
+ 'lo02I1Aiz',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags,
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='table', name='Table', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addPanels([
+ $.simpleGraphPanel(
+ { '@95%ile': '#e0752d' },
+ 'OSD Read Latencies',
+ '',
+ 'ms',
+ null,
+ '0',
+ |||
+ avg (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
+ )
+ ||| % $.matchers(),
+ 'AVG read',
+ 0,
+ 0,
+ 8,
+ 8
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ max(
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
+ )
+ ||| % $.matchers(),
+ 'MAX read'
+ ),
+ $.addTargetSchema(
+ |||
+ quantile(0.95,
+ (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
+ * 1000
+ )
+ )
+ ||| % $.matchers(),
+ '@95%ile'
+ ),
+ ],
+ ),
+ $.addTableSchema(
+ '$datasource',
+ "This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
+ { col: 2, desc: true },
+ [
+ $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+ $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest READ Latencies',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (sort(
+ (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
+ )
+ ))
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
+ $.simpleGraphPanel(
+ {
+ '@95%ile write': '#e0752d',
+ },
+ 'OSD Write Latencies',
+ '',
+ 'ms',
+ null,
+ '0',
+ |||
+ avg(
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
+ * 1000
+ )
+ ||| % $.matchers(),
+ 'AVG write',
+ 12,
+ 0,
+ 8,
+ 8
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ max(
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
+ )
+ ||| % $.matchers(), 'MAX write'
+ ),
+ $.addTargetSchema(
+ |||
+ quantile(0.95, (
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
+ ))
+ ||| % $.matchers(), '@95%ile write'
+ ),
+ ],
+ ),
+ $.addTableSchema(
+ '$datasource',
+ "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
+ { col: 2, desc: true },
+ [
+ $.overviewStyle(
+ 'OSD ID', 'ceph_daemon', 'string', 'short'
+ ),
+ $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest WRITE Latencies',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (sort(
+ (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000)
+ ))
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
+ $.simplePieChart(
+ {}, '', 'OSD Types Summary'
+ )
+ .addTarget(
+ $.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}')
+ ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
+ $.simplePieChart(
+ { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2
+ )
+ )
+ .addTarget(
+ $.addTargetSchema(
+ 'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2
+ )
+ ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
+ $.simplePieChart(  // OSD Size Summary: one target per size bucket, each counting OSDs whose ceph_osd_stat_bytes falls in it
+ {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
+ )
+ .addTarget($.addTargetSchema(  // bucket bounds are multiples of 1099511627776 bytes (2^40)
+ 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(  // open-ended top bucket (>= 12 * 2^40 bytes); it was labelled '<12TB+', contradicting the query
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '12TB+', 'time_series', 2
+ )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
+ g.graphPanel.new(bars=true,
+ datasource='$datasource',
+ title='Distribution of PGs per OSD',
+ x_axis_buckets=20,
+ x_axis_mode='histogram',
+ x_axis_values=['total'],
+ formatY1='short',
+ formatY2='short',
+ labelY1='# of OSDs',
+ min='0',
+ nullPointMode='null')
+ .addTarget($.addTargetSchema(
+ 'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true
+ )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
+ $.gaugeSingleStatPanel(
+ 'percentunit',
+ 'OSD onode Hits Ratio',
+ 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
+ 'current',
+ true,
+ 1,
+ true,
+ false,
+ '.75',
+ |||
+ sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
+ sum(ceph_bluestore_onode_hits{%(matchers)s}) +
+ sum(ceph_bluestore_onode_misses{%(matchers)s})
+ )
+ ||| % $.matchers(),
+ 'time_series',
+ 20,
+ 8,
+ 4,
+ 8
+ ),
+ $.addRowSchema(false,
+ true,
+ 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
+ $.simpleGraphPanel(
+ {},
+ 'Read/Write Profile',
+ 'Show the read/write workload profile overtime',
+ 'short',
+ null,
+ null,
+ 'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'Reads',
+ 0,
+ 17,
+ 24,
+ 8
+ )
+ .addTargets([$.addTargetSchema(
+ 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
+ )]),
+ ]),
+ 'osd-device-details.json':
+ local OsdDeviceDetailsPanel(title,  // helper: builds one graph panel with exactly two query targets
+ description,
+ formatY1,  // forwarded to $.graphPanelSchema together with labelY1
+ labelY1,
+ expr1,  // query of the first target
+ expr2,  // query of the second target
+ legendFormat1,  // legend of the first target
+ legendFormat2,  // legend of the second target
+ x,  // x, y, w, h are copied verbatim into the panel's gridPos
+ y,
+ w,
+ h) =
+ $.graphPanelSchema({},  // positional arguments are defined in utils.libsonnet (not shown here)
+ title,
+ description,
+ 'null',  // presumably the null-point mode (cf. nullPointMode='null' on g.graphPanel.new) -- TODO confirm
+ false,
+ formatY1,
+ 'short',  // presumably the second Y-axis format (cf. formatY2='short' on g.graphPanel.new) -- TODO confirm
+ labelY1,
+ null,
+ null,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(expr1,
+ legendFormat1),
+ $.addTargetSchema(expr2, legendFormat2),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };  // mix the grid placement into the finished panel object
+
+ $.dashboardSchema(  // 'OSD device details' dashboard: schema, annotation, requirements and template variables; panels are added after this chain.
+ 'OSD device details',
+ '',
+ 'CrAHE0iZz',  // presumably the dashboard uid - TODO confirm against $.dashboardSchema
+ 'now-3h',
+ '30s',
+ 16,
+ $._config.dashboardTags,
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource',  // declares the 'datasource' variable that panels reference as '$datasource'
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('osd',  // declares the 'osd' variable; the panel queries filter on ceph_daemon=~"$osd"
+ '$datasource',
+ 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),  // values come from the ceph_daemon label of ceph_osd_metadata
+ 1,
+ false,
+ 1,
+ 'OSD',
+ '(.*)')
+ )
+ .addPanels([
+ $.addRowSchema(  // 'OSD Performance' row followed by three Ceph-level graphs for $osd: latency, IOPS and bytes.
+ false, true, 'OSD Performance'
+ ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ OsdDeviceDetailsPanel(
+ '$osd Latency',
+ '',
+ 's',
+ 'Read (-) / Write (+)',
+ |||
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ |||
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ 'read',
+ 'write',
+ 0,
+ 1,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ {
+ alias: 'read',  // matches the 'read' legend above so the read series is drawn below the axis
+ transform: 'negative-Y',
+ }
+ ),
+ OsdDeviceDetailsPanel(
+ '$osd R/W IOPS',
+ '',
+ 'short',
+ 'Read (-) / Write (+)',
+ 'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'Reads',
+ 'Writes',
+ 6,
+ 1,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: 'Reads', transform: 'negative-Y' }  // alias matches the 'Reads' legend of the first target
+ ),
+ OsdDeviceDetailsPanel(
+ '$osd R/W Bytes',
+ '',
+ 'bytes',  // NOTE(review): the queries are rate() values; the physical-device bytes panel uses 'Bps' - confirm whether 'bytes' is intended here
+ 'Read (-) / Write (+)',
+ 'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'Read Bytes',
+ 'Write Bytes',
+ 12,
+ 1,
+ 6,
+ 9
+ )
+ .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
+ $.addRowSchema(  // 'Physical Device Performance' row followed by four node_disk_* graphs restricted to the device(s) behind $osd.
+ false, true, 'Physical Device Performance'
+ ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
+ OsdDeviceDetailsPanel(  // Every query below keeps only node_disk_* series that match ceph_disk_occupation_human for $osd on (instance, device), after label_replace rewrites both sides' labels.
+ 'Physical Device Latency for $osd',
+ '',
+ 's',
+ 'Read (-) / Write (+)',
+ |||
+ (
+ label_replace(
+ rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
+ rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ )
+ ||| % $.matchers(),
+ |||
+ (
+ label_replace(
+ rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
+ rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
+ label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ )
+ ||| % $.matchers(),
+ '{{instance}}/{{device}} Reads',
+ '{{instance}}/{{device}} Writes',
+ 0,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }  // regex alias: any series whose legend ends in 'Reads' is drawn below the axis
+ ),
+ OsdDeviceDetailsPanel(
+ 'Physical Device R/W IOPS for $osd',
+ '',
+ 'short',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ |||
+ label_replace(
+ rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}} on {{instance}} Writes',  // in this panel the first target is the write query, the second the read query
+ '{{device}} on {{instance}} Reads',
+ 6,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ OsdDeviceDetailsPanel(
+ 'Physical Device R/W Bytes for $osd',
+ '',
+ 'Bps',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ |||
+ label_replace(
+ rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{instance}} {{device}} Reads',
+ '{{instance}} {{device}} Writes',
+ 12,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ $.graphPanelSchema(  // Single-target utilisation graph, built directly because the two-target helper does not fit.
+ {},
+ 'Physical Device Util% for $osd',
+ '',
+ 'null',
+ false,
+ 'percentunit',
+ 'short',
+ null,
+ null,
+ null,
+ 1,
+ '$datasource'
+ )
+ .addTarget($.addTargetSchema(
+ |||
+ label_replace(
+ rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}} on {{instance}}'
+ )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
+ ]),
+}
diff --git a/monitoring/ceph-mixin/dashboards/pool.libsonnet b/monitoring/ceph-mixin/dashboards/pool.libsonnet
new file mode 100644
index 000000000..6444335d9
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/pool.libsonnet
@@ -0,0 +1,552 @@
+local g = import 'grafonnet/grafana.libsonnet';
+
+(import 'utils.libsonnet') {
+ 'pool-overview.json':  // 'Ceph Pools Overview' dashboard: schema, annotation and template variables; panels are added after this chain.
+ $.dashboardSchema(
+ 'Ceph Pools Overview',
+ '',
+ 'z99hzWtmk',  // presumably the dashboard uid - TODO confirm against $.dashboardSchema
+ 'now-1h',
+ '30s',
+ 22,
+ $._config.dashboardTags,
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')  // declares the 'datasource' variable used as '$datasource'
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ g.template.custom(label='TopK',  // declares the 'topk' variable consumed by the 'Top $topk ...' panels
+ name='topk',
+ current='15',
+ query='15')
+ )
+ .addPanels([
+ $.simpleSingleStatPanel(  // Top row: eight 3x3 single-stat panels laid out left to right at y=0 (x = 0, 3, ..., 21).
+ 'none',
+ 'Pools',
+ '',
+ 'avg',
+ 'count(ceph_pool_metadata{%(matchers)s})' % $.matchers(),
+ true,
+ 'table',
+ 0,
+ 0,
+ 3,
+ 3
+ ),
+ $.simpleSingleStatPanel(
+ 'none',
+ 'Pools with Compression',
+ 'Count of the pools that have compression enabled',
+ 'current',
+ 'count(ceph_pool_metadata{%(matchers)s, compression_mode!="none"})' % $.matchers(),
+ null,
+ '',
+ 3,
+ 0,
+ 3,
+ 3
+ ),
+ $.simpleSingleStatPanel(
+ 'bytes',
+ 'Total Raw Capacity',
+ 'Total raw capacity available to the cluster',
+ 'current',
+ 'sum(ceph_osd_stat_bytes{%(matchers)s})' % $.matchers(),
+ null,
+ '',
+ 6,
+ 0,
+ 3,
+ 3
+ ),
+ $.simpleSingleStatPanel(
+ 'bytes',
+ 'Raw Capacity Consumed',
+ 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)',
+ 'current',
+ 'sum(ceph_pool_bytes_used{%(matchers)s})' % $.matchers(),
+ true,
+ '',
+ 9,
+ 0,
+ 3,
+ 3
+ ),
+ $.simpleSingleStatPanel(
+ 'bytes',
+ 'Logical Stored ',
+ 'Total of client data stored in the cluster',
+ 'current',
+ 'sum(ceph_pool_stored{%(matchers)s})' % $.matchers(),
+ true,
+ '',
+ 12,
+ 0,
+ 3,
+ 3
+ ),
+ $.simpleSingleStatPanel(
+ 'bytes',
+ 'Compression Savings',
+ 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression',
+ 'current',
+ |||
+ sum(
+ ceph_pool_compress_under_bytes{%(matchers)s} -
+ ceph_pool_compress_bytes_used{%(matchers)s}
+ )
+ ||| % $.matchers(),
+ null,
+ '',
+ 15,
+ 0,
+ 3,
+ 3
+ ),
+ $.simpleSingleStatPanel(
+ 'percent',  // the query below already multiplies the ratio by 100
+ 'Compression Eligibility',
+ 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data',
+ 'current',
+ |||
+ (
+ sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
+ sum(ceph_pool_stored_raw{%(matchers)s} and ceph_pool_compress_under_bytes{%(matchers)s} > 0)
+ ) * 100
+ ||| % $.matchers(),
+ null,
+ 'table',
+ 18,
+ 0,
+ 3,
+ 3
+ ),
+ $.simpleSingleStatPanel(
+ 'none',
+ 'Compression Factor',
+ 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)',
+ 'current',
+ |||
+ sum(
+ ceph_pool_compress_under_bytes{%(matchers)s} > 0)
+ / sum(ceph_pool_compress_bytes_used{%(matchers)s} > 0
+ )
+ ||| % $.matchers(),
+ null,
+ '',
+ 21,
+ 0,
+ 3,
+ 3
+ ),
+ $.addTableSchema(  // 'Pool Overview' table: one column style per label/value pattern, then eleven queries (plus one empty target) feeding it.
+ '$datasource',
+ '',
+ { col: 5, desc: true },
+ [
+ $.overviewStyle('', 'Time', 'hidden', 'short'),  // the third argument 'hidden' is used for columns that should not be shown
+ $.overviewStyle('', 'instance', 'hidden', 'short'),
+ $.overviewStyle('', 'job', 'hidden', 'short'),
+ $.overviewStyle('Pool Name', 'name', 'string', 'short'),
+ $.overviewStyle('Pool ID', 'pool_id', 'hidden', 'none'),
+ $.overviewStyle('Compression Factor', 'Value #A', 'number', 'none'),  // presumably 'Value #X' selects the value column of target X below - TODO confirm
+ $.overviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70', '85']),
+ $.overviewStyle('Usable Free', 'Value #B', 'number', 'bytes'),
+ $.overviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent'),
+ $.overviewStyle('Compression Savings', 'Value #E', 'number', 'bytes'),
+ $.overviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0']),
+ $.overviewStyle('IOPS', 'Value #G', 'number', 'none'),
+ $.overviewStyle('Bandwidth', 'Value #H', 'number', 'Bps'),
+ $.overviewStyle('', '__name__', 'hidden', 'short'),
+ $.overviewStyle('', 'type', 'hidden', 'short'),
+ $.overviewStyle('', 'compression_mode', 'hidden', 'short'),
+ $.overviewStyle('Type', 'description', 'string', 'short'),
+ $.overviewStyle('Stored', 'Value #J', 'number', 'bytes'),
+ $.overviewStyle('', 'Value #I', 'hidden', 'short'),
+ $.overviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{ text: 'ON', value: '1' }]),  // the value map renders value '1' as 'ON'
+ ],
+ 'Pool Overview',
+ 'table'
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ (
+ ceph_pool_compress_under_bytes{%(matchers)s} /
+ ceph_pool_compress_bytes_used{%(matchers)s} > 0
+ ) and on(pool_id) (
+ (
+ (ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
+ ceph_pool_stored_raw{%(matchers)s}
+ ) * 100 > 0.5
+ )
+ ||| % $.matchers(),
+ 'A',  // compression factor, kept only for pools whose eligibility percentage exceeds 0.5
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ ceph_pool_max_avail{%(matchers)s} *
+ on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s}
+ ||| % $.matchers(),
+ 'B',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ (
+ (ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
+ ceph_pool_stored_raw{%(matchers)s}
+ ) * 100
+ ||| % $.matchers(),
+ 'C',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ ceph_pool_percent_used{%(matchers)s} *
+ on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s}
+ ||| % $.matchers(),
+ 'D',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ ceph_pool_compress_under_bytes{%(matchers)s} -
+ ceph_pool_compress_bytes_used{%(matchers)s} > 0
+ ||| % $.matchers(),
+ 'E',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ 'delta(ceph_pool_stored{%(matchers)s}[5d])' % $.matchers(), 'F', 'table', 1, true  // five-day change in stored bytes, shown as 'Growth (5d)'
+ ),
+ $.addTargetSchema(
+ |||
+ rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])
+ + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ 'G',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ 'H',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ 'ceph_pool_metadata{%(matchers)s}' % $.matchers(), 'I', 'table', 1, true  // raw metadata series; its value column ('Value #I') is hidden by the styles above
+ ),
+ $.addTargetSchema(
+ 'ceph_pool_stored{%(matchers)s} * on(pool_id) group_left ceph_pool_metadata{%(matchers)s}' % $.matchers(),
+ 'J',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ 'ceph_pool_metadata{%(matchers)s, compression_mode!="none"}' % $.matchers(), 'K', 'table', 1, true
+ ),
+ $.addTargetSchema('', 'L', '', '', null),  // NOTE(review): target with an empty query and no matching column style - confirm it is still needed
+ ]
+ ) + { gridPos: { x: 0, y: 3, w: 24, h: 6 } },
+ $.simpleGraphPanel(  // Graph of the $topk busiest pools by client IOPS; the first target is read+write, the added target is write-only.
+ {},
+ 'Top $topk Client IOPS by Pool',
+ 'This chart shows the sum of read and write IOPS from all clients by pool',
+ 'short',
+ 'IOPS',
+ 0,
+ |||
+ topk($topk,
+ round(
+ (
+ rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])
+ ), 1
+ ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s})
+ ||| % $.matchers(),
+ '{{name}} ',
+ 0,
+ 9,
+ 12,
+ 8
+ )
+ .addTarget(
+ $.addTargetSchema(  // Join with ceph_pool_metadata by multiplying, as the first target does; the previous '+' added the metadata sample value to every write rate.
+ |||
+ topk($topk,
+ rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s}
+ )
+ ||| % $.matchers(),
+ '{{name}} - write'
+ )
+ ),
+ $.simpleGraphPanel(  // Graph of the $topk pools by combined read+write byte rate, labelled by pool name.
+ {},
+ 'Top $topk Client Bandwidth by Pool',
+ 'The chart shows the sum of read and write bytes from all clients, by pool',
+ 'Bps',
+ 'Throughput',
+ 0,
+ |||
+ topk($topk,
+ (
+ rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval])
+ ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s}
+ )
+ ||| % $.matchers(),
+ '{{name}}',
+ 12,
+ 9,
+ 12,
+ 8
+ ),
+ $.simpleGraphPanel(  // Full-width graph of ceph_pool_bytes_used per pool over time.
+ {},
+ 'Pool Capacity Usage (RAW)',
+ 'Historical view of capacity usage, to help identify growth and trends in pool consumption',
+ 'bytes',
+ 'Capacity Used',
+ 0,
+ 'ceph_pool_bytes_used{%(matchers)s} * on(pool_id) group_right ceph_pool_metadata{%(matchers)s}' % $.matchers(),  // group_right keeps the metadata side's labels, which the '{{name}}' legend needs
+ '{{name}}',
+ 0,
+ 17,
+ 24,
+ 7
+ ),
+ ]),
+ 'pool-detail.json':  // 'Ceph Pool Details' dashboard: schema, requirements, annotation and template variables; panels are added after this chain.
+ $.dashboardSchema(
+ 'Ceph Pool Details',
+ '',
+ '-xyV8KCiz',  // presumably the dashboard uid - TODO confirm against $.dashboardSchema
+ 'now-1h',
+ '30s',
+ 22,
+ $._config.dashboardTags,
+ ''
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='singlestat', name='Singlestat', version='5.0.0'
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')  // declares the 'datasource' variable used as '$datasource'
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('pool_name',  // declares the 'pool_name' variable; every panel query filters metadata on name=~"$pool_name"
+ '$datasource',
+ 'label_values(ceph_pool_metadata{%(matchers)s}, name)' % $.matchers(),
+ 1,
+ false,
+ 1,
+ 'Pool Name',
+ '')
+ )
+ .addPanels([
+ $.gaugeSingleStatPanel(  // Gauge of stored / (stored + max_avail) for the selected pool.
+ 'percentunit',
+ 'Capacity used',
+ '',
+ 'current',
+ true,
+ 1,  // presumably the gauge maximum, i.e. 100% in 'percentunit' - TODO confirm against utils.libsonnet
+ true,
+ true,
+ '.7,.8',  // presumably the colour thresholds (70% / 80%) - TODO confirm
+ |||
+ (ceph_pool_stored{%(matchers)s} / (ceph_pool_stored{%(matchers)s} + ceph_pool_max_avail{%(matchers)s})) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'time_series',
+ 0,
+ 0,
+ 7,
+ 7
+ ),
+ $.gaugeSingleStatPanel(  // Single stat: max_avail divided by the 6h fill rate, for the selected pool, shown only when positive.
+ 's',
+ 'Time till full',
+ 'Time till pool is full assuming the average fill rate of the last 6 hours',
+ 'current',  // Arguments 4-9 now follow the order of the other $.gaugeSingleStatPanel calls (string, flag, number, flag, flag, string); 'current' was previously passed last, shifting every value one slot.
+ false,
+ 100,
+ false,
+ false,
+ '',  // empty string in the slot where 'Capacity used' passes '.7,.8'
+ |||
+ (ceph_pool_max_avail{%(matchers)s} / deriv(ceph_pool_stored{%(matchers)s}[6h])) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} > 0
+ ||| % $.matchers(),
+ 'time_series',
+ 7,
+ 0,
+ 5,
+ 7
+ ),
+ $.simpleGraphPanel(  // Graph of the per-second change in object count (deriv over 1m) for the selected pool.
+ {
+ read_op_per_sec:
+ '#3F6833',
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Object Ingress/Egress',
+ '',
+ 'ops',
+ 'Objects out(-) / in(+) ',
+ null,
+ |||
+ deriv(ceph_pool_objects{%(matchers)s}[1m]) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'Objects per second',
+ 12,
+ 0,
+ 12,
+ 7
+ ),
+ $.simpleGraphPanel(  // Graph of client read and write op rates for the selected pool; reads are drawn below the axis.
+ {
+ read_op_per_sec: '#3F6833',
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Client IOPS',
+ '',
+ 'iops',
+ 'Read (-) / Write (+)',
+ null,
+ |||
+ rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'reads',
+ 0,
+ 7,
+ 12,
+ 7
+ )
+ .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })  // alias matches the 'reads' legend of the first target
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'writes'
+ )
+ ),
+ $.simpleGraphPanel(  // Graph of client read and write byte rates for the selected pool; reads are drawn below the axis.
+ {
+ read_op_per_sec: '#3F6833',
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Client Throughput',
+ '',
+ 'Bps',
+ 'Read (-) / Write (+)',
+ null,  // Both queries multiply by ceph_pool_metadata, like the 'Client IOPS' panel; the previous '+' added the metadata sample value to every byte rate.
+ |||
+ rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'reads',
+ 12,
+ 7,
+ 12,
+ 7
+ )
+ .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })  // alias matches the 'reads' legend of the first target
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'writes'
+ )
+ ),
+ $.simpleGraphPanel(  // Graph of the absolute object count of the selected pool.
+ {
+ read_op_per_sec: '#3F6833',  // NOTE(review): these colour keys do not match this panel's only legend ('Number of Objects') - confirm whether they are needed
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Objects',
+ '',
+ 'short',
+ 'Objects',
+ null,
+ |||
+ ceph_pool_objects{%(matchers)s} *
+ on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'Number of Objects',
+ 0,
+ 14,
+ 12,
+ 7
+ ),
+ ]),
+}
diff --git a/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/monitoring/ceph-mixin/dashboards/rbd.libsonnet
new file mode 100644
index 000000000..0eca5a877
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/rbd.libsonnet
@@ -0,0 +1,337 @@
+local g = import 'grafonnet/grafana.libsonnet';
+local u = import 'utils.libsonnet';
+
+(import 'utils.libsonnet') {
+ 'rbd-details.json':
+ local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) =  // Helper: graph panel with a write target (expr1) and a read target (expr2), placed at gridPos x/y/w/h.
+ $.graphPanelSchema({},
+ title,
+ '',
+ 'null as zero',
+ false,
+ formatY1,
+ formatY1,  // the same format is passed in the next positional slot as well (presumably the second Y axis - TODO confirm)
+ null,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(expr1,
+ '{{pool}} Write'),  // legends are fixed: expr1 is always labelled Write, expr2 Read
+ $.addTargetSchema(expr2, '{{pool}} Read'),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(  // 'RBD Details' dashboard: schema, annotation, requirements and template variables; panels are added after this chain.
+ 'RBD Details',
+ 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)',
+ 'YhCYGcuZz',  // presumably the dashboard uid - TODO confirm against $.dashboardSchema
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags,
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.3'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')  // declares the 'datasource' variable used as '$datasource'
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('pool',  // declares the 'pool' variable used as pool="$pool" in the panel queries
+ '$datasource',
+ 'label_values(pool)',  // NOTE(review): unlike the other template queries this is not restricted by %(matchers)s or a metric name - confirm that is intended
+ 1,
+ false,
+ 0,
+ '',
+ '')
+ )
+ .addTemplate(
+ $.addTemplateSchema('image',  // declares the 'image' variable used as image="$image" in the panel queries
+ '$datasource',
+ 'label_values(image)',  // NOTE(review): same unrestricted label lookup as 'pool' above
+ 1,
+ false,
+ 0,
+ '',
+ '')
+ )
+ .addPanels([
+ RbdDetailsPanel(  // Three side-by-side graphs for the selected pool/image: IOPS, throughput and average latency (write query first, read query second).
+ 'IOPS',
+ 'iops',
+ 'rate(ceph_rbd_write_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers()
+ ,
+ 'rate(ceph_rbd_read_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(),
+ 0,
+ 0,
+ 8,
+ 9
+ ),
+ RbdDetailsPanel(
+ 'Throughput',
+ 'Bps',
+ 'rate(ceph_rbd_write_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_rbd_read_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(),
+ 8,
+ 0,
+ 8,
+ 9
+ ),
+ RbdDetailsPanel(
+ 'Average Latency',
+ 'ns',  // average = rate of latency sum divided by rate of latency count; displayed with the 'ns' unit
+ |||
+ rate(ceph_rbd_write_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) /
+ rate(ceph_rbd_write_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])
+ ||| % $.matchers(),
+ |||
+ rate(ceph_rbd_read_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) /
+ rate(ceph_rbd_read_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])
+ ||| % $.matchers(),
+ 16,
+ 0,
+ 8,
+ 9
+ ),
+ ]),
+ 'rbd-overview.json':
+ local RbdOverviewPanel(title,  // Helper: graph panel with two targets and caller-supplied legends, placed at gridPos x/y/w/h.
+ formatY1,  // forwarded positionally to $.graphPanelSchema
+ expr1,  // query of the first target, labelled with legendFormat1
+ expr2,  // query of the second target, labelled with legendFormat2
+ legendFormat1,
+ legendFormat2,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema({},
+ title,
+ '',
+ 'null',
+ false,
+ formatY1,
+ 'short',
+ null,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(expr1,
+ legendFormat1),
+ $.addTargetSchema(expr2,
+ legendFormat2),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(  // 'RBD Overview' dashboard: schema, annotation, requirements and template variables; panels are added after this chain.
+ 'RBD Overview',
+ '',
+ '41FrpeUiz',  // presumably the dashboard uid - TODO confirm against $.dashboardSchema
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags + ['overview'],  // appends 'overview' to the configured dashboard tags
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.4.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='datasource', id='prometheus', name='Prometheus', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='table', name='Table', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')  // declares the 'datasource' variable used as '$datasource'
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addPanels([
+ RbdOverviewPanel(  // Three side-by-side graphs summed over all matching RBD series: IOPS, throughput and average latency (write query first, read query second).
+ 'IOPS',
+ 'short',
+ 'round(sum(rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'round(sum(rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'Writes',
+ 'Reads',
+ 0,
+ 0,
+ 8,
+ 7
+ ),
+ RbdOverviewPanel(
+ 'Throughput',
+ 'Bps',
+ 'round(sum(rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'round(sum(rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'Write',
+ 'Read',
+ 8,
+ 0,
+ 8,
+ 7
+ ),
+ RbdOverviewPanel(
+ 'Average Latency',
+ 'ns',  // average = summed latency-sum rate divided by summed latency-count rate; displayed with the 'ns' unit
+ |||
+ round(
+ sum(rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval])) /
+ sum(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]))
+ )
+ ||| % $.matchers(),
+ |||
+ round(
+ sum(rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval])) /
+ sum(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]))
+ )
+ ||| % $.matchers(),
+ 'Write',
+ 'Read',
+ 16,
+ 0,
+ 8,
+ 7
+ ),
+ $.addTableSchema(  // Three side-by-side top-10 tables at y=7: highest IOPS, highest throughput and highest latency per image.
+ '$datasource',
+ '',
+ { col: 3, desc: true },
+ [
+ $.overviewStyle('Pool', 'pool', 'string', 'short'),
+ $.overviewStyle('Image', 'image', 'string', 'short'),
+ $.overviewStyle('IOPS', 'Value', 'number', 'iops'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),  // catch-all pattern hiding every column not styled above
+ ],
+ 'Highest IOPS',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (
+ sort((
+ rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval]) +
+ on (image, pool, namespace) rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])
+ ))
+ )
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 0, y: 7, w: 8, h: 7 } },
+ $.addTableSchema(
+ '$datasource',
+ '',
+ { col: 3, desc: true },
+ [
+ $.overviewStyle('Pool', 'pool', 'string', 'short'),
+ $.overviewStyle('Image', 'image', 'string', 'short'),
+ $.overviewStyle('Throughput', 'Value', 'number', 'Bps'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest Throughput',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ sort(
+ sum(
+ rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])
+ ) by (pool, image, namespace)
+ )
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 8, y: 7, w: 8, h: 7 } },
+ $.addTableSchema(
+ '$datasource',
+ '',
+ { col: 3, desc: true },
+ [
+ $.overviewStyle('Pool', 'pool', 'string', 'short'),
+ $.overviewStyle('Image', 'image', 'string', 'short'),
+ $.overviewStyle('Latency', 'Value', 'number', 'ns'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest Latency',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(  // the query clamps each latency-count rate to at least 1 before dividing, then adds the write and read quotients per image
+ |||
+ topk(10,
+ sum(
+ rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval]) /
+ clamp_min(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]), 1) +
+ rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval]) /
+ clamp_min(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]), 1)
+ ) by (pool, image, namespace)
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 16, y: 7, w: 8, h: 7 } },
+ ]),
+}
diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet
new file mode 100644
index 000000000..892480d1c
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet
@@ -0,0 +1,872 @@
+local g = import 'grafonnet/grafana.libsonnet';
+local u = import 'utils.libsonnet';
+
+(import 'utils.libsonnet') {
+ 'radosgw-sync-overview.json':
+   // Builds one stacked graph panel that plots, per source zone, the rate of a
+   // single RGW sync metric.
+   //   title, formatY1, labelY1: panel title, left Y-axis unit and label.
+   //   rgwMetric: Prometheus metric name substituted into the query.
+   //   x, y, w, h: gridPos of the panel.
+   local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
+     local query =
+       'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))'
+       % ($.matchers() + { rgwMetric: rgwMetric });
+     $.graphPanelSchema(
+       {},
+       title,
+       '',
+       'null as zero',
+       true,
+       formatY1,
+       'short',
+       labelY1,
+       null,
+       0,
+       1,
+       '$datasource'
+     )
+     .addTargets([$.addTargetSchema(query, '{{source_zone}}')])
+     + { gridPos: { x: x, y: y, w: w, h: h } };
+
+   $.dashboardSchema(
+     'RGW Sync Overview',
+     '',
+     'rgw-sync-overview',
+     'now-1h',
+     '30s',
+     16,
+     $._config.dashboardTags + ['overview'],
+     ''
+   )
+   .addAnnotation(
+     $.addAnnotationSchema(
+       1,
+       '-- Grafana --',
+       true,
+       true,
+       'rgba(0, 211, 255, 1)',
+       'Annotations & Alerts',
+       'dashboard'
+     )
+   )
+   .addRequired(
+     type='grafana', id='grafana', name='Grafana', version='5.0.0'
+   )
+   .addRequired(
+     type='panel', id='graph', name='Graph', version='5.0.0'
+   )
+   .addTemplate(
+     g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+   )
+   .addTemplate(
+     $.addClusterTemplate()
+   )
+   .addTemplate(
+     $.addJobTemplate()
+   )
+   .addTemplate(
+     // addTemplateSchema takes `label` before `regex`. The display label goes
+     // first and the regex is left empty, matching the other templates in
+     // this file; previously 'RGW Server' was passed as the regex.
+     $.addTemplateSchema(
+       'rgw_servers',
+       '$datasource',
+       'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+       1,
+       true,
+       1,
+       'RGW Server',
+       ''
+     )
+   )
+   .addPanels([
+     // Top row: three panels, each 8 columns wide.
+     RgwSyncOverviewPanel(
+       'Replication (throughput) from Source Zone',
+       'Bps',
+       null,
+       'ceph_data_sync_from_zone_fetch_bytes_sum',
+       0,
+       0,
+       8,
+       7
+     ),
+     RgwSyncOverviewPanel(
+       'Replication (objects) from Source Zone',
+       'short',
+       'Objects/s',
+       'ceph_data_sync_from_zone_fetch_bytes_count',
+       8,
+       0,
+       8,
+       7
+     ),
+     RgwSyncOverviewPanel(
+       'Polling Request Latency from Source Zone',
+       'ms',
+       null,
+       'ceph_data_sync_from_zone_poll_latency_sum',
+       16,
+       0,
+       8,
+       7
+     ),
+     // Second row.
+     RgwSyncOverviewPanel(
+       'Unsuccessful Object Replications from Source Zone',
+       'short',
+       'Count/s',
+       'ceph_data_sync_from_zone_fetch_errors',
+       0,
+       7,
+       8,
+       7
+     ),
+   ]),
+ 'radosgw-overview.json':
+ // Builds a non-stacked graph panel with a single Prometheus target for the
+ // RGW overview dashboard.
+ //   expr1 / legendFormat1: query and legend of the initial target; callers
+ //     append further targets with `.addTargets`.
+   //   x, y, w, h: gridPos of the panel.
+ //   legend_*: forwarded unchanged to graphPanelSchema (all default to false).
+ local RgwOverviewPanel(
+   title,
+   description,
+   formatY1,
+   formatY2,
+   expr1,
+   legendFormat1,
+   x,
+   y,
+   w,
+   h,
+   datasource='$datasource',
+   legend_alignAsTable=false,
+   legend_avg=false,
+   legend_min=false,
+   legend_max=false,
+   legend_current=false,
+   legend_values=false
+ ) =
+   local panel = $.graphPanelSchema(
+     aliasColors={},
+     title=title,
+     description=description,
+     nullPointMode='null',
+     stack=false,
+     formatY1=formatY1,
+     formatY2=formatY2,
+     labelY1=null,
+     labelY2=null,
+     min=0,
+     fill=1,
+     datasource=datasource,
+     legend_alignAsTable=legend_alignAsTable,
+     legend_avg=legend_avg,
+     legend_min=legend_min,
+     legend_max=legend_max,
+     legend_current=legend_current,
+     legend_values=legend_values
+   );
+   local placement = { gridPos: { x: x, y: y, w: w, h: h } };
+   panel.addTargets([$.addTargetSchema(expr1, legendFormat1)]) + placement;
+
+ $.dashboardSchema(
+ 'RGW Overview',
+ '',
+ 'WAkugZpiz',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+   // addTemplateSchema takes `label` before `regex`. The display label goes
+   // first and the regex is left empty, matching the 'code' and
+   // 'ingress_service' templates below; previously 'RGW Server' was passed
+   // as the regex.
+   $.addTemplateSchema(
+     'rgw_servers',
+     '$datasource',
+     'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+     1,
+     true,
+     1,
+     'RGW Server',
+     ''
+   )
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'code',
+ '$datasource',
+ 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)',
+ 1,
+ true,
+ 1,
+ 'HTTP Code',
+ ''
+ )
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'job_haproxy',
+ '$datasource',
+ 'label_values(haproxy_server_status, job)',
+ 1,
+ true,
+ 1,
+ 'job haproxy',
+ '(.*)',
+ multi=true,
+ allValues='.+',
+ ),
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'ingress_service',
+ '$datasource',
+ 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)',
+ 1,
+ true,
+ 1,
+ 'Ingress Service',
+ ''
+ )
+ )
+ .addPanels([
+ $.addRowSchema(false,
+ true,
+ 'RGW Overview - All Gateways') +
+ {
+ gridPos: { x: 0, y: 0, w: 24, h: 1 },
+ },
+ RgwOverviewPanel(
+ 'Average GET/PUT Latencies by RGW Instance',
+ '',
+ 's',
+ 'short',
+ |||
+ label_replace(
+ rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ 'GET {{rgw_host}}',
+ 0,
+ 1,
+ 8,
+ 7
+ ).addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ label_replace(
+ rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ 'PUT {{rgw_host}}'
+ ),
+ ]
+ ),
+ RgwOverviewPanel(
+ 'Total Requests/sec by RGW Instance',
+ '',
+ 'none',
+ 'short',
+ |||
+ sum by (rgw_host) (
+ label_replace(
+ rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 8,
+ 1,
+ 7,
+ 7
+ ),
+ RgwOverviewPanel(
+ 'GET Latencies by RGW Instance',
+ 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
+ 's',
+ 'short',
+ |||
+ label_replace(
+ rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 15,
+ 1,
+ 6,
+ 7
+ ),
+ RgwOverviewPanel(
+ 'Bandwidth Consumed by Type',
+ 'Total bytes transferred in/out of all radosgw instances within the cluster',
+ 'bytes',
+ 'short',
+ 'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
+ 'GETs',
+ 0,
+ 8,
+ 8,
+ 6
+ ).addTargets(
+ [$.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
+ 'PUTs')]
+ ),
+ RgwOverviewPanel(
+ 'Bandwidth by RGW Instance',
+ 'Total bytes transferred in/out through get/put operations, by radosgw instance',
+ 'bytes',
+ 'short',
+ |||
+ label_replace(sum by (instance_id) (
+ rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 8,
+ 8,
+ 7,
+ 6
+ ),
+ RgwOverviewPanel(
+ 'PUT Latencies by RGW Instance',
+ 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
+ 's',
+ 'short',
+ |||
+ label_replace(
+ rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 15,
+ 8,
+ 6,
+ 6
+ ),
+ $.addRowSchema(
+ false, true, 'RGW Overview - HAProxy Metrics'
+ ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
+ RgwOverviewPanel(
+ 'Total responses by HTTP code',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval]
+ )
+ ) by (code)
+ |||,
+ 'Frontend {{ code }}',
+ 0,
+ 12,
+ 5,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval]
+ )
+ ) by (code)
+ |||, 'Backend {{ code }}'
+ ),
+ ]
+ )
+ .addSeriesOverride([
+ {
+ alias: '/.*Back.*/',
+ transform: 'negative-Y',
+ },
+ { alias: '/.*1.*/' },
+ { alias: '/.*2.*/' },
+ { alias: '/.*3.*/' },
+ { alias: '/.*4.*/' },
+ { alias: '/.*5.*/' },
+ { alias: '/.*other.*/' },
+ ]),
+ RgwOverviewPanel(
+ 'Total requests / responses',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||,
+ 'Requests',
+ 5,
+ 12,
+ 5,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Response errors', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Requests errors'
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Backend redispatch', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Backend retry', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Request denied', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}
+ ) by (instance)
+ |||, 'Backend Queued', 'time_series', 2
+ ),
+ ]
+ )
+ .addSeriesOverride([
+ {
+ alias: '/.*Response.*/',
+ transform: 'negative-Y',
+ },
+ {
+ alias: '/.*Backend.*/',
+ transform: 'negative-Y',
+ },
+ ]),
+ RgwOverviewPanel(
+ 'Total number of connections',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||,
+ 'Front',
+ 10,
+ 12,
+ 5,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Back'
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Back errors'
+ ),
+ ]
+ )
+ .addSeriesOverride([
+ {
+ alias: '/.*Back.*/',
+ transform: 'negative-Y',
+ },
+ ]),
+ RgwOverviewPanel(
+ 'Current total of incoming / outgoing bytes',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||,
+ 'IN Front',
+ 15,
+ 12,
+ 6,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||, 'OUT Front', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||, 'IN Back', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||, 'OUT Back', 'time_series', 2
+ ),
+ ]
+ )
+ .addSeriesOverride([
+ {
+ alias: '/.*OUT.*/',
+ transform: 'negative-Y',
+ },
+ ]),
+ ]),
+ 'radosgw-detail.json':
+ // Builds a non-stacked graph panel for the RGW instance-detail dashboard
+ // carrying two Prometheus targets (expr1/legendFormat1 and
+ // expr2/legendFormat2), placed at gridPos x, y, w, h.
+ local RgwDetailsPanel(
+   aliasColors,
+   title,
+   description,
+   formatY1,
+   formatY2,
+   expr1,
+   expr2,
+   legendFormat1,
+   legendFormat2,
+   x,
+   y,
+   w,
+   h
+ ) =
+   local targets = [
+     $.addTargetSchema(expr1, legendFormat1),
+     $.addTargetSchema(expr2, legendFormat2),
+   ];
+   local panel = $.graphPanelSchema(
+     aliasColors=aliasColors,
+     title=title,
+     description=description,
+     nullPointMode='null',
+     stack=false,
+     formatY1=formatY1,
+     formatY2=formatY2,
+     labelY1=null,
+     labelY2=null,
+     min=0,
+     fill=1,
+     datasource='$datasource'
+   );
+   panel.addTargets(targets) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(
+ 'RGW Instance Detail',
+ '',
+ 'x5ARzZtmk',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel',
+ id='grafana-piechart-panel',
+ name='Pie Chart',
+ version='1.3.3'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('rgw_servers',
+ '$datasource',
+ 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ '',
+ '')
+ )
+ .addPanels([
+ $.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ RgwDetailsPanel(
+ {},
+ '$rgw_servers GET/PUT Latencies',
+ '',
+ 's',
+ 'short',
+ |||
+ sum by (instance_id) (
+ rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval])
+ ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ |||
+ sum by (instance_id) (
+ rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval])
+ ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'GET {{ceph_daemon}}',
+ 'PUT {{ceph_daemon}}',
+ 0,
+ 1,
+ 6,
+ 8
+ ),
+ RgwDetailsPanel(
+ {},
+ 'Bandwidth by HTTP Operation',
+ '',
+ 'bytes',
+ 'short',
+ |||
+ rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ |||
+ rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon)
+ ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'GETs {{ceph_daemon}}',
+ 'PUTs {{ceph_daemon}}',
+ 6,
+ 1,
+ 7,
+ 8
+ ),
+ RgwDetailsPanel(
+ {
+ GETs: '#7eb26d',
+ Other: '#447ebc',
+ PUTs: '#eab839',
+ Requests: '#3f2b5b',
+ 'Requests Failed': '#bf1b00',
+ },
+ 'HTTP Request Breakdown',
+ '',
+ 'short',
+ 'short',
+ |||
+ rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ |||
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Requests Failed {{ceph_daemon}}',
+ 'GETs {{ceph_daemon}}',
+ 13,
+ 1,
+ 7,
+ 8
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'PUTs {{ceph_daemon}}'
+ ),
+ $.addTargetSchema(
+ |||
+ (
+ rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
+ (
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
+ )
+ ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Other {{ceph_daemon}}'
+ ),
+ ]
+ ),
+ $.simplePieChart(
+ {
+ GETs: '#7eb26d',
+ 'Other (HEAD,POST,DELETE)': '#447ebc',
+ PUTs: '#eab839',
+ Requests: '#3f2b5b',
+ Failures: '#bf1b00',
+ }, '', 'Workload Breakdown'
+ )
+ .addTarget($.addTargetSchema(
+ |||
+ rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Failures {{ceph_daemon}}'
+ ))
+ .addTarget($.addTargetSchema(
+ |||
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'GETs {{ceph_daemon}}'
+ ))
+ .addTarget($.addTargetSchema(
+ |||
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'PUTs {{ceph_daemon}}'
+ ))
+ .addTarget($.addTargetSchema(
+ |||
+ (
+ rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
+ (
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
+ )
+ ) * on (instance_id) group_left (ceph_daemon)
+ ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Other (DELETE,LIST) {{ceph_daemon}}'
+ )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
+ ]),
+}
diff --git a/monitoring/ceph-mixin/dashboards/utils.libsonnet b/monitoring/ceph-mixin/dashboards/utils.libsonnet
new file mode 100644
index 000000000..a7774c7ce
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/utils.libsonnet
@@ -0,0 +1,333 @@
+local g = import 'grafonnet/grafana.libsonnet';
+
+{
+ // Hidden field that the importing object is expected to override; the
+ // `error` is raised if it is evaluated without an override. This file reads
+ // `clusterLabel` and `showMultiCluster` from it.
+ _config:: error 'must provide _config',
+
+ // Creates a dashboard by forwarding every argument, by name, to
+ // `g.dashboard.new`. All parameters are required.
+ dashboardSchema(
+   title,
+   description,
+   uid,
+   time_from,
+   refresh,
+   schemaVersion,
+   tags,
+   timezone
+ )::
+   g.dashboard.new(
+     title=title,
+     description=description,
+     uid=uid,
+     time_from=time_from,
+     refresh=refresh,
+     schemaVersion=schemaVersion,
+     tags=tags,
+     timezone=timezone
+   ),
+
+ // Creates a graph panel by forwarding every argument, by name, to
+ // `g.graphPanel.new`. The six `legend_*` options are optional and default
+ // to false; everything else is required.
+ graphPanelSchema(
+   aliasColors,
+   title,
+   description,
+   nullPointMode,
+   stack,
+   formatY1,
+   formatY2,
+   labelY1,
+   labelY2,
+   min,
+   fill,
+   datasource,
+   legend_alignAsTable=false,
+   legend_avg=false,
+   legend_min=false,
+   legend_max=false,
+   legend_current=false,
+   legend_values=false
+ )::
+   g.graphPanel.new(
+     aliasColors=aliasColors,
+     title=title,
+     description=description,
+     nullPointMode=nullPointMode,
+     stack=stack,
+     formatY1=formatY1,
+     formatY2=formatY2,
+     labelY1=labelY1,
+     labelY2=labelY2,
+     min=min,
+     fill=fill,
+     datasource=datasource,
+     legend_alignAsTable=legend_alignAsTable,
+     legend_avg=legend_avg,
+     legend_min=legend_min,
+     legend_max=legend_max,
+     legend_current=legend_current,
+     legend_values=legend_values
+   ),
+
+
+ // Builds a Prometheus query target via `g.prometheus.target`. Only `expr`
+ // is required; the defaults are an empty legend, the 'time_series' format,
+ // an interval factor of 1 and a null `instant`.
+ addTargetSchema(
+   expr,
+   legendFormat='',
+   format='time_series',
+   intervalFactor=1,
+   instant=null
+ )::
+   g.prometheus.target(
+     expr=expr,
+     legendFormat=legendFormat,
+     format=format,
+     intervalFactor=intervalFactor,
+     instant=instant
+   ),
+
+ // Creates a dashboard template variable by forwarding every argument, by
+ // name, to `g.template.new`.
+ // Note the positional order: `label` comes before `regex`.
+ // `hide`, `multi` and `allValues` are optional.
+ addTemplateSchema(
+   name,
+   datasource,
+   query,
+   refresh,
+   includeAll,
+   sort,
+   label,
+   regex,
+   hide='',
+   multi=false,
+   allValues=null
+ )::
+   g.template.new(
+     name=name,
+     datasource=datasource,
+     query=query,
+     refresh=refresh,
+     includeAll=includeAll,
+     sort=sort,
+     label=label,
+     regex=regex,
+     hide=hide,
+     multi=multi,
+     allValues=allValues
+   ),
+
+ // Creates an annotation source by forwarding every argument, by name, to
+ // `g.annotation.datasource`.
+ addAnnotationSchema(
+   builtIn,
+   datasource,
+   enable,
+   hide,
+   iconColor,
+   name,
+   type
+ )::
+   g.annotation.datasource(
+     builtIn=builtIn,
+     datasource=datasource,
+     enable=enable,
+     hide=hide,
+     iconColor=iconColor,
+     name=name,
+     type=type
+   ),
+
+ // Creates a dashboard row via `g.row.new`, forwarding all three arguments
+ // by name.
+ addRowSchema(collapse, showTitle, title)::
+   g.row.new(
+     collapse=collapse,
+     showTitle=showTitle,
+     title=title
+   ),
+
+ // Creates a singlestat panel by forwarding every argument, by name, to
+ // `g.singlestat.new`. All parameters are required.
+ addSingleStatSchema(
+   colors,
+   datasource,
+   format,
+   title,
+   description,
+   valueName,
+   colorValue,
+   gaugeMaxValue,
+   gaugeShow,
+   sparklineShow,
+   thresholds
+ )::
+   g.singlestat.new(
+     colors=colors,
+     datasource=datasource,
+     format=format,
+     title=title,
+     description=description,
+     valueName=valueName,
+     colorValue=colorValue,
+     gaugeMaxValue=gaugeMaxValue,
+     gaugeShow=gaugeShow,
+     sparklineShow=sparklineShow,
+     thresholds=thresholds
+   ),
+
+ // Creates a pie chart panel by forwarding every argument, by name, to
+ // `g.pieChartPanel.new`.
+ addPieChartSchema(
+   aliasColors,
+   datasource,
+   description,
+   legendType,
+   pieType,
+   title,
+   valueName
+ )::
+   g.pieChartPanel.new(
+     aliasColors=aliasColors,
+     datasource=datasource,
+     description=description,
+     legendType=legendType,
+     pieType=pieType,
+     title=title,
+     valueName=valueName
+   ),
+
+ // Creates a table panel by forwarding every argument, by name, to
+ // `g.tablePanel.new`.
+ addTableSchema(
+   datasource,
+   description,
+   sort,
+   styles,
+   title,
+   transform
+ )::
+   g.tablePanel.new(
+     datasource=datasource,
+     description=description,
+     sort=sort,
+     styles=styles,
+     title=title,
+     transform=transform
+   ),
+
+ // Returns a plain object whose fields carry the same names and values as
+ // the parameters; no library call is involved.
+ addStyle(
+   alias,
+   colorMode,
+   colors,
+   dateFormat,
+   decimals,
+   mappingType,
+   pattern,
+   thresholds,
+   type,
+   unit,
+   valueMaps
+ )::
+   {
+     // Which column the style applies to and how it is labelled.
+     pattern: pattern,
+     alias: alias,
+     type: type,
+     unit: unit,
+     // Formatting.
+     dateFormat: dateFormat,
+     decimals: decimals,
+     // Colouring and value mapping.
+     colorMode: colorMode,
+     colors: colors,
+     thresholds: thresholds,
+     mappingType: mappingType,
+     valueMaps: valueMaps,
+   },
+
+ // Returns the label-matcher fragments that are interpolated into the
+ // PromQL strings via `%(matchers)s` / `%(clusterMatcher)s`.
+ //   jobMatcher: always 'job=~"$job"'.
+ //   clusterMatcher: '<clusterLabel>=~"$cluster"' when
+ //     `$._config.showMultiCluster` is true, otherwise ''.
+ //   matchers: jobMatcher, followed by ', ' and the cluster matcher when
+ //     `$._config.showMultiCluster` is true.
+ matchers()::
+   local multiCluster = $._config.showMultiCluster;
+   local byJob = 'job=~"$job"';
+   local byCluster = '%s=~"$cluster"' % $._config.clusterLabel;
+   {
+     // Common labels
+     jobMatcher: byJob,
+     clusterMatcher: if multiCluster then byCluster else '',
+     matchers: if multiCluster then byJob + ', ' + byCluster else byJob,
+   },
+
+ // Template variable `cluster`, populated from the values of the configured
+ // cluster label on `ceph_osd_metadata`. `hide` is set to 'variable' when
+ // `$._config.showMultiCluster` is false and to '' otherwise.
+ addClusterTemplate()::
+   $.addTemplateSchema(
+     name='cluster',
+     datasource='$datasource',
+     query='label_values(ceph_osd_metadata, %s)' % $._config.clusterLabel,
+     refresh=1,
+     includeAll=true,
+     sort=1,
+     label='cluster',
+     regex='(.*)',
+     hide=(if $._config.showMultiCluster then '' else 'variable'),
+     multi=true,
+     allValues='.+'
+   ),
+
+ // Template variable `job`, populated from the `job` label values of
+ // `ceph_osd_metadata`, filtered by the cluster matcher from `$.matchers()`.
+ // `hide` is left at addTemplateSchema's default.
+ addJobTemplate()::
+   $.addTemplateSchema(
+     name='job',
+     datasource='$datasource',
+     query='label_values(ceph_osd_metadata{%(clusterMatcher)s}, job)' % $.matchers(),
+     refresh=1,
+     includeAll=true,
+     sort=1,
+     label='job',
+     regex='(.*)',
+     multi=true,
+     allValues='.+'
+   ),
+
+ // Convenience wrapper around `$.addStyle` that fixes the colour list, the
+ // date format ('YYYY-MM-DD HH:mm:ss'), decimals (2) and mappingType (1),
+ // leaving only the per-column settings to the caller.
+ overviewStyle(
+   alias,
+   pattern,
+   type,
+   unit,
+   colorMode=null,
+   thresholds=[],
+   valueMaps=[]
+ )::
+   local palette = [
+     'rgba(245, 54, 54, 0.9)',
+     'rgba(237, 129, 40, 0.89)',
+     'rgba(50, 172, 45, 0.97)',
+   ];
+   $.addStyle(
+     alias=alias,
+     colorMode=colorMode,
+     colors=palette,
+     dateFormat='YYYY-MM-DD HH:mm:ss',
+     decimals=2,
+     mappingType=1,
+     pattern=pattern,
+     thresholds=thresholds,
+     type=type,
+     unit=unit,
+     valueMaps=valueMaps
+   ),
+
+ // Builds a non-stacked graph panel with one Prometheus target and places
+ // it at gridPos x, y, w, h. `alias` is passed on as the panel's
+ // aliasColors; the right Y axis always uses the 'short' format.
+ simpleGraphPanel(
+   alias,
+   title,
+   description,
+   formatY1,
+   labelY1,
+   min,
+   expr,
+   legendFormat,
+   x,
+   y,
+   w,
+   h
+ )::
+   local panel = $.graphPanelSchema(
+     aliasColors=alias,
+     title=title,
+     description=description,
+     nullPointMode='null',
+     stack=false,
+     formatY1=formatY1,
+     formatY2='short',
+     labelY1=labelY1,
+     labelY2=null,
+     min=min,
+     fill=1,
+     datasource='$datasource'
+   );
+   panel.addTargets([$.addTargetSchema(expr, legendFormat)])
+   + { gridPos: { x: x, y: y, w: w, h: h } },
+
+ // Builds a singlestat panel with one Prometheus target and places it at
+ // gridPos x, y, w, h. Value colouring, gauge and sparkline are all
+ // switched off, and the thresholds string is empty.
+ simpleSingleStatPanel(
+   format,
+   title,
+   description,
+   valueName,
+   expr,
+   instant,
+   targetFormat,
+   x,
+   y,
+   w,
+   h
+ )::
+   local stat = $.addSingleStatSchema(
+     colors=['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+     datasource='$datasource',
+     format=format,
+     title=title,
+     description=description,
+     valueName=valueName,
+     colorValue=false,
+     gaugeMaxValue=100,
+     gaugeShow=false,
+     sparklineShow=false,
+     thresholds=''
+   );
+   local target = $.addTargetSchema(expr, '', targetFormat, 1, instant);
+   stat.addTarget(target) + {
+     gridPos: { x: x, y: y, w: w, h: h },
+   },
+ // Builds a singlestat panel whose value colouring, gauge, sparkline and
+ // thresholds are chosen by the caller, adds one Prometheus target and
+ // places the panel at gridPos x, y, w, h.
+ // The `sparkLineShow` parameter is passed on as `sparklineShow`.
+ gaugeSingleStatPanel(
+   format,
+   title,
+   description,
+   valueName,
+   colorValue,
+   gaugeMaxValue,
+   gaugeShow,
+   sparkLineShow,
+   thresholds,
+   expr,
+   targetFormat,
+   x,
+   y,
+   w,
+   h
+ )::
+   local stat = $.addSingleStatSchema(
+     colors=['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+     datasource='$datasource',
+     format=format,
+     title=title,
+     description=description,
+     valueName=valueName,
+     colorValue=colorValue,
+     gaugeMaxValue=gaugeMaxValue,
+     gaugeShow=gaugeShow,
+     sparklineShow=sparkLineShow,
+     thresholds=thresholds
+   );
+   stat.addTarget($.addTargetSchema(expr, '', targetFormat))
+   + { gridPos: { x: x, y: y, w: w, h: h } },
+
+ // Builds a pie chart panel on '$datasource' with pieType 'pie', legendType
+ // 'Under graph' and valueName 'current'. `alias` is passed on as the
+ // panel's aliasColors.
+ simplePieChart(alias, description, title)::
+   $.addPieChartSchema(
+     aliasColors=alias,
+     datasource='$datasource',
+     description=description,
+     legendType='Under graph',
+     pieType='pie',
+     title=title,
+     valueName='current'
+   ),
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/.lint b/monitoring/ceph-mixin/dashboards_out/.lint
new file mode 100644
index 000000000..6352e858f
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/.lint
@@ -0,0 +1,5 @@
+exclusions:
+ template-instance-rule:
+ reason: "Instance template not needed because of ceph-mgr leader election."
+ target-instance-rule:
+ reason: "Instance matcher not needed because of ceph-mgr leader election."
diff --git a/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json b/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json
new file mode 100644
index 000000000..6988a6299
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json
@@ -0,0 +1,1244 @@
+{
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "heatmap",
+ "name": "Heatmap",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "vonage-status-panel",
+ "name": "Status Panel",
+ "version": "1.0.8"
+ }
+ ],
+ "annotations": {
+ "list": []
+ },
+ "description": "Ceph cluster overview",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1525415495309,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": true,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 128, 45, 0.9)",
+ "rgba(237, 129, 40, 0.9)",
+ "rgb(255, 0, 0)"
+ ],
+ "datasource": "$datasource",
+ "editable": false,
+ "error": false,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 0
+ },
+ "hideTimeOverride": true,
+ "id": 21,
+ "interval": "1m",
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "ceph_health_status",
+ "format": "time_series",
+ "instant": true,
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "1,2",
+ "timeFrom": null,
+ "title": "Health Status",
+ "transparent": false,
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "OK",
+ "value": "0"
+ },
+ {
+ "op": "=",
+ "text": "WARN",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "ERR",
+ "value": "2"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "colorMode": "Panel",
+ "colors": {
+ "crit": "rgb(255, 0, 0)",
+ "disable": "rgba(128, 128, 128, 0.9)",
+ "ok": "rgba(50, 128, 45, 0.9)",
+ "warn": "rgba(237, 129, 40, 0.9)"
+ },
+ "cornerRadius": 0,
+ "datasource": "$datasource",
+ "displayName": "",
+ "flipCard": false,
+ "flipTime": 5,
+ "fontFormat": "Regular",
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 0
+ },
+ "id": 43,
+ "isAutoScrollOnOverflow": false,
+ "isGrayOnNoData": false,
+ "isHideAlertsOnDisable": false,
+ "isIgnoreOKColors": false,
+ "links": [],
+ "targets": [
+ {
+ "aggregation": "Last",
+ "alias": "All",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "count(ceph_osd_metadata)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "All",
+ "refId": "A",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "In",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osd_in)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "In",
+ "refId": "B",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Out",
+ "decimals": 2,
+ "displayAliasType": "Warning / Critical",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osd_in == bool 0)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Out",
+ "refId": "C",
+ "units": "none",
+ "valueHandler": "Number Threshold",
+ "warn": 1
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Up",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osd_up)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Up",
+ "refId": "D",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Down",
+ "crit": 2,
+ "decimals": 2,
+ "displayAliasType": "Warning / Critical",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osd_up == bool 0)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Down",
+ "refId": "E",
+ "units": "none",
+ "valueHandler": "Number Threshold",
+ "warn": 1
+ }
+ ],
+ "title": "OSDs",
+ "type": "vonage-status-panel"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "decimals": 2,
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 1,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 47,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_osd_stat_bytes_used)/sum(ceph_osd_stat_bytes)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Used",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "0.7,0.8",
+ "title": "Capacity used",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 53,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Active",
+ "color": "#508642",
+ "fill": 1,
+ "stack": "A"
+ },
+ {
+ "alias": "Total",
+ "color": "#f9e2d2"
+ },
+ {
+ "alias": "Degraded",
+ "color": "#eab839"
+ },
+ {
+ "alias": "Undersized",
+ "color": "#f9934e"
+ },
+ {
+ "alias": "Inconsistent",
+ "color": "#e24d42"
+ },
+ {
+ "alias": "Down",
+ "color": "#bf1b00"
+ },
+ {
+ "alias": "Inactive",
+ "color": "#bf1b00",
+ "fill": 4,
+ "linewidth": 0,
+ "stack": "A"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(ceph_pg_total)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Total",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(ceph_pg_active)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Active",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(ceph_pg_total - ceph_pg_active)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Inactive",
+ "refId": "G"
+ },
+ {
+ "expr": "sum(ceph_pg_undersized)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Undersized",
+ "refId": "F"
+ },
+ {
+ "expr": "sum(ceph_pg_degraded)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Degraded",
+ "refId": "C"
+ },
+ {
+ "expr": "sum(ceph_pg_inconsistent)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Inconsistent",
+ "refId": "D"
+ },
+ {
+ "expr": "sum(ceph_pg_down)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Down",
+ "refId": "E"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PG States",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 66,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Avg Apply Latency",
+ "color": "#7eb26d"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "quantile(0.95, ceph_osd_apply_latency_ms)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Apply Latency P_95",
+ "refId": "A"
+ },
+ {
+ "expr": "quantile(0.95, ceph_osd_commit_latency_ms)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Commit Latency P_95",
+ "refId": "B"
+ },
+ {
+ "expr": "avg(ceph_osd_apply_latency_ms)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Avg Apply Latency",
+ "refId": "C"
+ },
+ {
+ "expr": "avg(ceph_osd_commit_latency_ms)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Avg Commit Latency",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "clusterName": "",
+ "colorMode": "Panel",
+ "colors": {
+ "crit": "rgba(245, 54, 54, 0.9)",
+ "disable": "rgba(128, 128, 128, 0.9)",
+ "ok": "rgba(50, 128, 45, 0.9)",
+ "warn": "rgba(237, 129, 40, 0.9)"
+ },
+ "cornerRadius": 1,
+ "datasource": "$datasource",
+ "displayName": "",
+ "flipCard": false,
+ "flipTime": 5,
+ "fontFormat": "Regular",
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 3
+ },
+ "id": 41,
+ "isAutoScrollOnOverflow": false,
+ "isGrayOnNoData": false,
+ "isHideAlertsOnDisable": false,
+ "isIgnoreOKColors": false,
+ "links": [],
+ "targets": [
+ {
+ "aggregation": "Last",
+ "alias": "In Quorum",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_mon_quorum_status)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "In Quorum",
+ "refId": "A",
+ "units": "none",
+ "valueHandler": "Text Only"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Total",
+ "crit": 1,
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "count(ceph_mon_quorum_status)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Total",
+ "refId": "B",
+ "units": "none",
+ "valueHandler": "Text Only",
+ "warn": 2
+ },
+ {
+ "aggregation": "Last",
+ "alias": "MONs out of Quorum",
+ "crit": 1.6,
+ "decimals": 2,
+ "displayAliasType": "Warning / Critical",
+ "displayType": "Annotation",
+ "displayValueWithAlias": "Never",
+ "expr": "count(ceph_mon_quorum_status) / sum(ceph_mon_quorum_status)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MONs out of Quorum",
+ "refId": "C",
+ "units": "none",
+ "valueHandler": "Number Threshold",
+ "warn": 1.1
+ }
+ ],
+ "title": "Monitors",
+ "type": "vonage-status-panel"
+ },
+ {
+ "colorMode": "Panel",
+ "colors": {
+ "crit": "rgba(245, 54, 54, 0.9)",
+ "disable": "rgba(128, 128, 128, 0.9)",
+ "ok": "rgba(50, 128, 45, 0.9)",
+ "warn": "rgba(237, 129, 40, 0.9)"
+ },
+ "cornerRadius": 0,
+ "datasource": "$datasource",
+ "displayName": "",
+ "flipCard": false,
+ "flipTime": 5,
+ "fontFormat": "Regular",
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 3
+ },
+ "id": 68,
+ "isAutoScrollOnOverflow": false,
+ "isGrayOnNoData": false,
+ "isHideAlertsOnDisable": false,
+ "isIgnoreOKColors": false,
+ "links": [],
+ "targets": [
+ {
+ "aggregation": "Last",
+ "alias": "Active",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "count(ceph_mgr_status == 1) or vector(0)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Active",
+ "refId": "A",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Standby",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "count(ceph_mgr_status == 0) or vector(0)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Standby",
+ "refId": "B",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ }
+ ],
+ "title": "MGRs",
+ "type": "vonage-status-panel"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "id": 45,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 0.5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(ceph_osd_op_w_in_bytes[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(irate(ceph_osd_op_r_out_bytes[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cluster I/O",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "id": 62,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(deriv(ceph_pool_stored[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "In-/Egress",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Egress (-) / Ingress (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": 1
+ },
+ "color": {
+ "cardColor": "rgb(0, 254, 255)",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateBlues",
+ "exponent": 0.5,
+ "min": null,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": "$datasource",
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 15
+ },
+ "heatmap": {},
+ "highlightCards": true,
+ "id": 55,
+ "legend": {
+ "show": true
+ },
+ "links": [],
+ "span": 12,
+ "targets": [
+ {
+ "expr": "ceph_osd_stat_bytes_used / ceph_osd_stat_bytes",
+ "format": "time_series",
+ "interval": "1m",
+ "intervalFactor": 1,
+ "legendFormat": "Util (%)",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "timeFrom": null,
+ "title": "OSD Capacity Utilization",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": "",
+ "yAxis": {
+ "decimals": 2,
+ "format": "percentunit",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": 1
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateBlues",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": "$datasource",
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 15
+ },
+ "heatmap": {},
+ "highlightCards": true,
+ "id": 59,
+ "legend": {
+ "show": true
+ },
+ "links": [],
+ "targets": [
+ {
+ "expr": "ceph_osd_numpg",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "#PGs",
+ "refId": "A"
+ }
+ ],
+ "title": "PGs per OSD",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": "",
+ "yAxis": {
+ "decimals": null,
+ "format": "none",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 15
+ },
+ "id": 64,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(ceph_osd_recovery_ops[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Op/s",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Recovery Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": "Recovery Ops/s",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "cluster"
+ ],
+ "templating": {
+ "list": [
+ {
+ "hide": 0,
+ "label": null,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "auto": true,
+ "auto_count": 10,
+ "auto_min": "1m",
+ "current": {
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ "datasource": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "Interval",
+ "multi": false,
+ "name": "interval",
+ "options": [
+ {
+ "selected": true,
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ {
+ "selected": false,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "refresh": 2,
+ "type": "interval"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph - Cluster",
+ "version": 13
+ }
diff --git a/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json
new file mode 100644
index 000000000..3e7aeef45
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json
@@ -0,0 +1,362 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.3.2"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "MDS Performance",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 1
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(ceph_objecter_op_r{job=~\"$job\", ceph_daemon=~\"($mds_servers).*\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read Ops",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(ceph_objecter_op_w{job=~\"$job\", ceph_daemon=~\"($mds_servers).*\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write Ops",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "MDS Workload - $mds_servers",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": "Reads (-) / Writes (+)",
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_mds_server_handle_client_request{job=~\"$job\", ceph_daemon=~\"($mds_servers).*\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{ceph_daemon}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Client Request Load - $mds_servers",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": "Client Requests",
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "MDS Server",
+ "multi": false,
+ "name": "mds_servers",
+ "options": [ ],
+ "query": "label_values(ceph_mds_inodes{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "MDS Performance",
+ "uid": "tbO9LAiZz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json
new file mode 100644
index 000000000..93c51f009
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/host-details.json
@@ -0,0 +1,1243 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.3.2"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ },
+ {
+ "id": "singlestat",
+ "name": "Singlestat",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "$ceph_hosts System Overview",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 0,
+ "y": 1
+ },
+ "id": 3,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(sum by (ceph_daemon) (ceph_osd_metadata{job=~\"$job\", hostname='$ceph_hosts'}))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "OSDs",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {
+ "interrupt": "#447EBC",
+ "steal": "#6D1F62",
+ "system": "#890F02",
+ "user": "#3F6833",
+ "wait": "#C15C17"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Shows the CPU breakdown. When multiple servers are selected, only the first host's CPU data is shown.",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 6,
+ "x": 3,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (mode) (\n rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval])\n) / (\n scalar(\n sum(rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]))\n ) * 100\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{mode}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU Utilization",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "% Utilization",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "Available": "#508642",
+ "Free": "#508642",
+ "Total": "#bf1b00",
+ "Used": "#bf1b00",
+ "total": "#bf1b00",
+ "used": "#0a50a1"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 6,
+ "x": 9,
+ "y": 1
+ },
+ "id": 5,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "total",
+ "color": "#bf1b00",
+ "fill": 0,
+ "linewidth": 2,
+ "stack": false
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Free",
+ "refId": "A"
+ },
+ {
+ "expr": "node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "total",
+ "refId": "B"
+ },
+ {
+ "expr": "(\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "buffers/cache",
+ "refId": "C"
+ },
+ {
+ "expr": "(\n node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) - (\n (\n node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) +\n (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n )\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "used",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "RAM Usage",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "RAM used",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Shows the network load (rx, tx) across all interfaces (excluding loopback 'lo')",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 6,
+ "x": 15,
+ "y": 1
+ },
+ "id": 6,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*tx/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (device) (\n rate(\n node_network_receive_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]\n )\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.rx",
+ "refId": "A"
+ },
+ {
+ "expr": "sum by (device) (\n rate(node_network_transmit_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval])\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.tx",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network Load",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "decbytes",
+ "label": "Send (-) / Receive (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 21,
+ "y": 1
+ },
+ "id": 7,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*tx/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(node_network_receive_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.rx",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(node_network_transmit_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.tx",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network drop rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "Send (-) / Receive (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 0,
+ "y": 6
+ },
+ "id": 8,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(\n ceph_osd_stat_bytes{job=~\"$job\"} and\n on (ceph_daemon) ceph_disk_occupation{job=~\"$job\", instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Raw Capacity",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 21,
+ "y": 6
+ },
+ "id": 9,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*tx/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(node_network_receive_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.rx",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(node_network_transmit_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.tx",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network error rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "Send (-) / Receive (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 11
+ },
+ "id": 10,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "OSD Disk Performance Statistics",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by its name and corresponding OSD id value",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 0,
+ "y": 12
+ },
+ "id": 11,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n (\n rate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) writes",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(\n (\n rate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\"},\"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$ceph_hosts Disk IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 12,
+ "y": 12
+ },
+ "id": 12,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*read/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n (\n rate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) write",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(\n (\n rate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$ceph_hosts Throughput by Disk",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with its corresponding OSD id",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 0,
+ "y": 21
+ },
+ "id": 13,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "max by(instance, device) (label_replace(\n (rate(node_disk_write_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001) or\n (rate(node_disk_read_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}})",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$ceph_hosts Disk Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 12,
+ "y": 21
+ },
+ "id": 14,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n (\n (rate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) / 10) or\n rate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) * 100\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\", instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}})",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$ceph_hosts Disk utilization",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "%Util",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin",
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Hostname",
+ "multi": false,
+ "name": "ceph_hosts",
+ "options": [ ],
+ "query": "label_values({}, instance)",
+ "refresh": 1,
+ "regex": "([^.:]*).*",
+ "sort": 3,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Host Details",
+ "uid": "rtOg0AiWz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/hosts-overview.json b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json
new file mode 100644
index 000000000..f1cd4c499
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json
@@ -0,0 +1,894 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.3.2"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ },
+ {
+ "id": "singlestat",
+ "name": "Singlestat",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(sum by (hostname) (ceph_osd_metadata{job=~\"$job\"}))",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "OSD Hosts",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster",
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 3,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg(1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n))\n",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG CPU Busy",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)",
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 4,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg ((\n (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) - ((\n node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (\n node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n )\n )\n) / (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"}\n))\n",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG RAM Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "IOPS Load at the device as reported by the OS on all OSD hosts",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 12,
+ "y": 0
+ },
+ "id": 5,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum ((\n rate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n) + (\n rate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n))\n",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Physical IOPS",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)",
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 16,
+ "y": 0
+ },
+ "id": 6,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg (\n label_replace(\n (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or\n (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),\n \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", instance=~\"($osd_hosts).*\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n )\n)\n",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG Disk Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Total send/receive network load across all hosts in the ceph cluster",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 20,
+ "y": 0
+ },
+ "id": 7,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum (\n (\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n (\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n)\n",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Network Load",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Show the top 10 busiest hosts by cpu",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 5
+ },
+ "id": 8,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk(10,\n 100 * (\n 1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n )\n )\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU Busy - Top 10 Hosts",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Top 10 hosts by network load",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 5
+ },
+ "id": 9,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk(10, (sum by(instance) (\n(\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) +\n(\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network Load - Top 10 Hosts",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "osd_hosts",
+ "options": [ ],
+ "query": "label_values(ceph_disk_occupation{job=~\"$job\"}, exported_instance)",
+ "refresh": 1,
+ "regex": "([^.]*).*",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "mon_hosts",
+ "options": [ ],
+ "query": "label_values(ceph_mon_metadata{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "mon.(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "mds_hosts",
+ "options": [ ],
+ "query": "label_values(ceph_mds_inodes{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "mds.(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "rgw_hosts",
+ "options": [ ],
+ "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "rgw.(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Host Overview",
+ "uid": "y0KGL0iZz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/osd-device-details.json b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json
new file mode 100644
index 000000000..384516fb0
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json
@@ -0,0 +1,871 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.3.2"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "OSD Performance",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "read",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_osd_op_r_latency_sum{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "read",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_osd_op_w_latency_sum{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "write",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$osd Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "Reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_osd_op_r{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_osd_op_w{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$osd R/W IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 12,
+ "y": 1
+ },
+ "id": 5,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "Read Bytes",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_osd_op_r_out_bytes{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read Bytes",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_osd_op_w_in_bytes{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write Bytes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$osd R/W Bytes",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 10
+ },
+ "id": 6,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "Physical Device Performance",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 11
+ },
+ "id": 7,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "(\n label_replace(\n rate(node_disk_read_time_seconds_total{}[$__rate_interval]) /\n rate(node_disk_reads_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n ) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}/{{device}} Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "(\n label_replace(\n rate(node_disk_write_time_seconds_total{}[$__rate_interval]) /\n rate(node_disk_writes_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device)\n label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n )\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}/{{device}} Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device Latency for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 11
+ },
+ "id": 8,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n rate(node_disk_writes_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}} on {{instance}} Writes",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(\n rate(node_disk_reads_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}} on {{instance}} Reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device R/W IOPS for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 12,
+ "y": 11
+ },
+ "id": 9,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n rate(node_disk_read_bytes_total{}[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(\n rate(node_disk_written_bytes_total{}[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device R/W Bytes for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 18,
+ "y": 11
+ },
+ "id": 10,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n rate(node_disk_io_time_seconds_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}} on {{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device Util% for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "percentunit",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "OSD",
+ "multi": false,
+ "name": "osd",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-3h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "OSD device details",
+ "uid": "CrAHE0iZz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json
new file mode 100644
index 000000000..5ea8955b2
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json
@@ -0,0 +1,963 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.0.0"
+ },
+ {
+ "id": "grafana-piechart-panel",
+ "name": "Pie Chart",
+ "type": "panel",
+ "version": "1.3.3"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ },
+ {
+ "id": "table",
+ "name": "Table",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "aliasColors": {
+ "@95%ile": "#e0752d"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval]) * 1000\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "AVG read",
+ "refId": "A"
+ },
+ {
+ "expr": "max(\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval]) * 1000\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MAX read",
+ "refId": "B"
+ },
+ {
+ "expr": "quantile(0.95,\n (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval])\n * 1000\n )\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "@95%ile",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Read Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ]
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "This table shows the OSDs that are delivering the 10 highest read latencies within the cluster",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 3,
+ "links": [ ],
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "OSD ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "ceph_daemon",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Latency (ms)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "none",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (sort(\n (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000\n )\n ))\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Highest READ Latencies",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "aliasColors": {
+ "@95%ile write": "#e0752d"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 12,
+ "y": 0
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval])\n * 1000\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "AVG write",
+ "refId": "A"
+ },
+ {
+ "expr": "max(\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MAX write",
+ "refId": "B"
+ },
+ {
+ "expr": "quantile(0.95, (\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000\n))\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "@95%ile write",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Write Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ]
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "This table shows the OSDs that are delivering the 10 highest write latencies within the cluster",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 20,
+ "y": 0
+ },
+ "id": 5,
+ "links": [ ],
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "OSD ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "ceph_daemon",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Latency (ms)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "none",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (sort(\n (rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000)\n ))\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Highest WRITE Latencies",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "aliasColors": { },
+ "datasource": "$datasource",
+ "description": "",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 0,
+ "y": 8
+ },
+ "id": 6,
+ "legend": {
+ "percentage": true,
+ "show": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "pieType": "pie",
+ "targets": [
+ {
+ "expr": "count by (device_class) (ceph_osd_metadata{job=~\"$job\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device_class}}",
+ "refId": "A"
+ }
+ ],
+ "title": "OSD Types Summary",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {
+ "Non-Encrypted": "#E5AC0E"
+ },
+ "datasource": "$datasource",
+ "description": "",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 4,
+ "y": 8
+ },
+ "id": 7,
+ "legend": {
+ "percentage": true,
+ "show": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "pieType": "pie",
+ "targets": [
+ {
+ "expr": "count(ceph_bluefs_wal_total_bytes{job=~\"$job\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "bluestore",
+ "refId": "A"
+ },
+ {
+ "expr": "absent(ceph_bluefs_wal_total_bytes{job=~\"$job\"}) * count(ceph_osd_metadata{job=~\"$job\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "filestore",
+ "refId": "B"
+ }
+ ],
+ "title": "OSD Objectstore Types",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ },
+ {
+ "aliasColors": { },
+ "datasource": "$datasource",
+ "description": "The pie chart shows the various OSD sizes used within the cluster",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 8,
+ "y": 8
+ },
+ "id": 8,
+ "legend": {
+ "percentage": true,
+ "show": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "pieType": "pie",
+ "targets": [
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} < 1099511627776)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<1TB",
+ "refId": "A"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 1099511627776 < 2199023255552)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<2TB",
+ "refId": "B"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 2199023255552 < 3298534883328)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<3TB",
+ "refId": "C"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 3298534883328 < 4398046511104)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<4TB",
+ "refId": "D"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 4398046511104 < 6597069766656)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<6TB",
+ "refId": "E"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 6597069766656 < 8796093022208)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<8TB",
+ "refId": "F"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 8796093022208 < 10995116277760)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<10TB",
+ "refId": "G"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 10995116277760 < 13194139533312)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<12TB",
+ "refId": "H"
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 13194139533312)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "12TB+",
+ "refId": "I"
+ }
+ ],
+ "title": "OSD Size Summary",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ },
+ {
+ "aliasColors": { },
+ "bars": true,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 12,
+ "y": 8
+ },
+ "id": 9,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_osd_numpg{job=~\"$job\"}",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "PGs per OSD",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Distribution of PGs per OSD",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": 20,
+ "mode": "histogram",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "# of OSDs",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ]
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster",
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 1,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 20,
+ "y": 8
+ },
+ "id": 10,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_bluestore_onode_hits{job=~\"$job\"}) / (\n sum(ceph_bluestore_onode_hits{job=~\"$job\"}) +\n sum(ceph_bluestore_onode_misses{job=~\"$job\"})\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": ".75",
+ "title": "OSD onode Hits Ratio",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 16
+ },
+ "id": 11,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "R/W Profile",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Show the read/write workload profile over time",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 17
+ },
+ "id": 12,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Read/Write Profile",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "OSD Overview",
+ "uid": "lo02I1Aiz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/pool-detail.json b/monitoring/ceph-mixin/dashboards_out/pool-detail.json
new file mode 100644
index 000000000..dc8b4152a
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/pool-detail.json
@@ -0,0 +1,708 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.3.2"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ },
+ {
+ "id": "singlestat",
+ "name": "Singlestat",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 1,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 7,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "(ceph_pool_stored{job=~\"$job\"} / (ceph_pool_stored{job=~\"$job\"} + ceph_pool_max_avail{job=~\"$job\"})) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": ".7,.8",
+ "title": "Capacity used",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": 100,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Time till pool is full assuming the average fill rate of the last 6 hours",
+ "format": "s",
+ "gauge": {
+ "maxValue": false,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 5,
+ "x": 7,
+ "y": 0
+ },
+ "id": 3,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": ""
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "(ceph_pool_max_avail{job=~\"$job\"} / deriv(ceph_pool_stored{job=~\"$job\"}[6h])) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"} > 0\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "current",
+ "title": "Time till full",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": false
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "deriv(ceph_pool_objects{job=~\"$job\"}[1m]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Objects per second",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Object Ingress/Egress",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": "Objects out(-) / in(+) ",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 7
+ },
+ "id": 5,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "reads",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Client IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "iops",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 7
+ },
+ "id": 6,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_pool_rd_bytes{job=~\"$job\"}[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "reads",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_pool_wr_bytes{job=~\"$job\"}[$__rate_interval]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Client Throughput",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 14
+ },
+ "id": 7,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_pool_objects{job=~\"$job\"} *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Number of Objects",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Objects",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Objects",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 22,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Pool Name",
+ "multi": false,
+ "name": "pool_name",
+ "options": [ ],
+ "query": "label_values(ceph_pool_metadata{job=~\"$job\"}, name)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph Pool Details",
+ "uid": "-xyV8KCiz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/pool-overview.json b/monitoring/ceph-mixin/dashboards_out/pool-overview.json
new file mode 100644
index 000000000..7f042aa5b
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/pool-overview.json
@@ -0,0 +1,1542 @@
+{
+ "__inputs": [ ],
+ "__requires": [ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(ceph_pool_metadata{job=~\"$job\"})",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Pools",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Count of the pools that have compression enabled",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 3,
+ "y": 0
+ },
+ "id": 3,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(ceph_pool_metadata{job=~\"$job\", compression_mode!=\"none\"})",
+ "format": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Pools with Compression",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Total raw capacity available to the cluster",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 6,
+ "y": 0
+ },
+ "id": 4,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_osd_stat_bytes{job=~\"$job\"})",
+ "format": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Total Raw Capacity",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Total raw capacity consumed by user data and associated overheads (metadata + redundancy)",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 9,
+ "y": 0
+ },
+ "id": 5,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_pool_bytes_used{job=~\"$job\"})",
+ "format": "",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Raw Capacity Consumed",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Total of client data stored in the cluster",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 12,
+ "y": 0
+ },
+ "id": 6,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_pool_stored{job=~\"$job\"})",
+ "format": "",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Logical Stored ",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 15,
+ "y": 0
+ },
+ "id": 7,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(\n ceph_pool_compress_under_bytes{job=~\"$job\"} -\n ceph_pool_compress_bytes_used{job=~\"$job\"}\n)\n",
+ "format": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Compression Savings",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data",
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 18,
+ "y": 0
+ },
+ "id": 8,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "(\n sum(ceph_pool_compress_under_bytes{job=~\"$job\"} > 0) /\n sum(ceph_pool_stored_raw{job=~\"$job\"} and ceph_pool_compress_under_bytes{job=~\"$job\"} > 0)\n) * 100\n",
+ "format": "table",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Compression Eligibility",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 21,
+ "y": 0
+ },
+ "id": 9,
+ "interval": null,
+ "links": [ ],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(\n ceph_pool_compress_under_bytes{job=~\"$job\"} > 0)\n / sum(ceph_pool_compress_bytes_used{job=~\"$job\"} > 0\n)\n",
+ "format": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Compression Factor",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "",
+ "gridPos": {
+ "h": 6,
+ "w": 24,
+ "x": 0,
+ "y": 3
+ },
+ "id": 10,
+ "links": [ ],
+ "sort": {
+ "col": 5,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Time",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "instance",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "job",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Pool Name",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "name",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Pool ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "pool_id",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "none",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Compression Factor",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #A",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "none",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "% Used",
+ "colorMode": "value",
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #D",
+ "thresholds": [
+ "70",
+ "85"
+ ],
+ "type": "number",
+ "unit": "percentunit",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Usable Free",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #B",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "bytes",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Compression Eligibility",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #C",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "percent",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Compression Savings",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #E",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "bytes",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Growth (5d)",
+ "colorMode": "value",
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #F",
+ "thresholds": [
+ "0",
+ "0"
+ ],
+ "type": "number",
+ "unit": "bytes",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "IOPS",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #G",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "none",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Bandwidth",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #H",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "Bps",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "__name__",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "type",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "compression_mode",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Type",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "description",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Stored",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #J",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "bytes",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #I",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Compression",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #K",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [
+ {
+ "text": "ON",
+ "value": "1"
+ }
+ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "(\n ceph_pool_compress_under_bytes{job=~\"$job\"} /\n ceph_pool_compress_bytes_used{job=~\"$job\"} > 0\n) and on(pool_id) (\n (\n (ceph_pool_compress_under_bytes{job=~\"$job\"} > 0) /\n ceph_pool_stored_raw{job=~\"$job\"}\n ) * 100 > 0.5\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "A",
+ "refId": "A"
+ },
+ {
+ "expr": "ceph_pool_max_avail{job=~\"$job\"} *\n on(pool_id) group_left(name) ceph_pool_metadata{job=~\"$job\"}\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "B",
+ "refId": "B"
+ },
+ {
+ "expr": "(\n (ceph_pool_compress_under_bytes{job=~\"$job\"} > 0) /\n ceph_pool_stored_raw{job=~\"$job\"}\n) * 100\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "C",
+ "refId": "C"
+ },
+ {
+ "expr": "ceph_pool_percent_used{job=~\"$job\"} *\n on(pool_id) group_left(name) ceph_pool_metadata{job=~\"$job\"}\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "D",
+ "refId": "D"
+ },
+ {
+ "expr": "ceph_pool_compress_under_bytes{job=~\"$job\"} -\n ceph_pool_compress_bytes_used{job=~\"$job\"} > 0\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "E",
+ "refId": "E"
+ },
+ {
+ "expr": "delta(ceph_pool_stored{job=~\"$job\"}[5d])",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "F",
+ "refId": "F"
+ },
+ {
+ "expr": "rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval])\n + rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval])\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "G",
+ "refId": "G"
+ },
+ {
+ "expr": "rate(ceph_pool_rd_bytes{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{job=~\"$job\"}[$__rate_interval])\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "H",
+ "refId": "H"
+ },
+ {
+ "expr": "ceph_pool_metadata{job=~\"$job\"}",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "I",
+ "refId": "I"
+ },
+ {
+ "expr": "ceph_pool_stored{job=~\"$job\"} * on(pool_id) group_left ceph_pool_metadata{job=~\"$job\"}",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "J",
+ "refId": "J"
+ },
+ {
+ "expr": "ceph_pool_metadata{job=~\"$job\", compression_mode!=\"none\"}",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "K",
+ "refId": "K"
+ },
+ {
+ "expr": "",
+ "format": "",
+ "intervalFactor": "",
+ "legendFormat": "L",
+ "refId": "L"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Pool Overview",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "This chart shows the sum of read and write IOPS from all clients by pool",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "id": 11,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk($topk,\n round(\n (\n rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval])\n ), 1\n ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\"})\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}} ",
+ "refId": "A"
+ },
+ {
+ "expr": "topk($topk,\n rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\"}\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}} - write",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Top $topk Client IOPS by Pool",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "IOPS",
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "The chart shows the sum of read and write bytes from all clients, by pool",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "id": 12,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk($topk,\n (\n rate(ceph_pool_rd_bytes{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{job=~\"$job\"}[$__rate_interval])\n ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\"}\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Top $topk Client Bandwidth by Pool",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Throughput",
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Historical view of capacity usage, to help identify growth and trends in pool consumption",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 17
+ },
+ "id": 13,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_pool_bytes_used{job=~\"$job\"} * on(pool_id) group_right ceph_pool_metadata{job=~\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Pool Capacity Usage (RAW)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "Capacity Used",
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 22,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "15",
+ "value": "15"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "TopK",
+ "multi": false,
+ "name": "topk",
+ "options": [
+ {
+ "text": "15",
+ "value": "15"
+ }
+ ],
+ "query": "15",
+ "refresh": 0,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Ceph Pools Overview",
+ "uid": "z99hzWtmk",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json
new file mode 100644
index 000000000..a0f8f3537
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json
@@ -0,0 +1,542 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.0.0"
+ },
+ {
+ "id": "grafana-piechart-panel",
+ "name": "Pie Chart",
+ "type": "panel",
+ "version": "1.3.3"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "RGW Host Detail : $rgw_servers",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (instance_id) (\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GET {{ceph_daemon}}",
+ "refId": "A"
+ },
+ {
+ "expr": "sum by (instance_id) (\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUT {{ceph_daemon}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$rgw_servers GET/PUT Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 7,
+ "x": 6,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth by HTTP Operation",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "GETs": "#7eb26d",
+ "Other": "#447ebc",
+ "PUTs": "#eab839",
+ "Requests": "#3f2b5b",
+ "Requests Failed": "#bf1b00"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 7,
+ "x": 13,
+ "y": 1
+ },
+ "id": 5,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_failed_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\",ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Requests Failed {{ceph_daemon}}",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "C"
+ },
+ {
+ "expr": "(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) -\n (\n rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Other {{ceph_daemon}}",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "HTTP Request Breakdown",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "Failures": "#bf1b00",
+ "GETs": "#7eb26d",
+ "Other (HEAD,POST,DELETE)": "#447ebc",
+ "PUTs": "#eab839",
+ "Requests": "#3f2b5b"
+ },
+ "datasource": "$datasource",
+ "description": "",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 20,
+ "y": 1
+ },
+ "id": 6,
+ "legend": {
+ "percentage": true,
+ "show": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "pieType": "pie",
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_failed_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Failures {{ceph_daemon}}",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "C"
+ },
+ {
+ "expr": "(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) -\n (\n rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}",
+ "refId": "D"
+ }
+ ],
+ "title": "Workload Breakdown",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin",
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "",
+ "multi": false,
+ "name": "rgw_servers",
+ "options": [ ],
+ "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RGW Instance Detail",
+ "uid": "x5ARzZtmk",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json
new file mode 100644
index 000000000..77d69e4f3
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json
@@ -0,0 +1,1266 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.0.0"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "RGW Overview - All Gateways",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 1
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GET {{rgw_host}}",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUT {{rgw_host}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Average GET/PUT Latencies by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 7,
+ "x": 8,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total Requests/sec by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Latencies are shown stacked, without a y-axis, to provide a visual indication of GET latency imbalance across RGW hosts",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 6,
+ "x": 15,
+ "y": 1
+ },
+ "id": 5,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "GET Latencies by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Total bytes transferred in/out of all radosgw instances within the cluster",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 8
+ },
+ "id": 6,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth Consumed by Type",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Total bytes transferred in/out through get/put operations, by radosgw instance",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 7,
+ "x": 8,
+ "y": 8
+ },
+ "id": 7,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Latencies are shown stacked, without a y-axis, to provide a visual indication of PUT latency imbalance across RGW hosts",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 15,
+ "y": 8
+ },
+ "id": 8,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PUT Latencies by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "collapse": false,
+ "collapsed": false,
+ "gridPos": {
+ "h": 12,
+ "w": 9,
+ "x": 0,
+ "y": 12
+ },
+ "id": 9,
+ "panels": [ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "RGW Overview - HAProxy Metrics",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 5,
+ "x": 0,
+ "y": 12
+ },
+ "id": 10,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ [
+ {
+ "alias": "/.*Back.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*1.*/"
+ },
+ {
+ "alias": "/.*2.*/"
+ },
+ {
+ "alias": "/.*3.*/"
+ },
+ {
+ "alias": "/.*4.*/"
+ },
+ {
+ "alias": "/.*5.*/"
+ },
+ {
+ "alias": "/.*other.*/"
+ }
+ ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(\n rate(\n haproxy_frontend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", proxy=~\"frontend\"}[$__rate_interval]\n )\n) by (code)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Frontend {{ code }}",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", proxy=~\"backend\"}[$__rate_interval]\n )\n) by (code)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Backend {{ code }}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total responses by HTTP code",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 5,
+ "x": 5,
+ "y": 12
+ },
+ "id": 11,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ [
+ {
+ "alias": "/.*Response.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*Backend.*/",
+ "transform": "negative-Y"
+ }
+ ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(\n rate(\n haproxy_frontend_http_requests_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Requests",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_response_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Response errors",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_frontend_request_errors_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Request errors",
+ "refId": "C"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_redispatch_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Backend redispatch",
+ "refId": "D"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_retry_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Backend retry",
+ "refId": "E"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_frontend_requests_denied_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Request denied",
+ "refId": "F"
+ },
+ {
+ "expr": "sum(\n haproxy_backend_current_queue{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Backend Queued",
+ "refId": "G"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total requests / responses",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 5,
+ "x": 10,
+ "y": 12
+ },
+ "id": 12,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ [
+ {
+ "alias": "/.*Back.*/",
+ "transform": "negative-Y"
+ }
+ ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(\n rate(\n haproxy_frontend_connections_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Front",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_connection_attempts_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Back",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_connection_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Back errors",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total number of connections",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 6,
+ "x": 15,
+ "y": 12
+ },
+ "id": 13,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ [
+ {
+ "alias": "/.*OUT.*/",
+ "transform": "negative-Y"
+ }
+ ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(\n rate(\n haproxy_frontend_bytes_in_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "IN Front",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_frontend_bytes_out_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "OUT Front",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_bytes_in_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "IN Back",
+ "refId": "C"
+ },
+ {
+ "expr": "sum(\n rate(\n haproxy_backend_bytes_out_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "OUT Back",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Current total of incoming / outgoing bytes",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin",
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "RGW Server",
+ "multi": false,
+ "name": "rgw_servers",
+ "options": [ ],
+ "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "HTTP Code",
+ "multi": false,
+ "name": "code",
+ "options": [ ],
+ "query": "label_values(haproxy_server_http_responses_total{job=~\"$job_haproxy\", instance=~\"$ingress_service\"}, code)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job haproxy",
+ "multi": true,
+ "name": "job_haproxy",
+ "options": [ ],
+ "query": "label_values(haproxy_server_status, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "Ingress Service",
+ "multi": false,
+ "name": "ingress_service",
+ "options": [ ],
+ "query": "label_values(haproxy_server_status{job=~\"$job_haproxy\"}, instance)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RGW Overview",
+ "uid": "WAkugZpiz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json
new file mode 100644
index 000000000..e0c3037d5
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json
@@ -0,0 +1,504 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.0.0"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_sum{job=~\"$job\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{source_zone}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Replication (throughput) from Source Zone",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_count{job=~\"$job\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{source_zone}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Replication (objects) from Source Zone",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Objects/s",
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_poll_latency_sum{job=~\"$job\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{source_zone}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Polling Request Latency from Source Zone",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 7
+ },
+ "id": 5,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_errors{job=~\"$job\"}[$__rate_interval]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{source_zone}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Unsuccessful Object Replications from Source Zone",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Count/s",
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin",
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "RGW Server",
+ "multi": false,
+ "name": "rgw_servers",
+ "options": [ ],
+ "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RGW Sync Overview",
+ "uid": "rgw-sync-overview",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/rbd-details.json b/monitoring/ceph-mixin/dashboards_out/rbd-details.json
new file mode 100644
index 000000000..f64de312a
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/rbd-details.json
@@ -0,0 +1,458 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.3.3"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Detailed Performance of RBD Images (IOPS/Throughput/Latency)",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rbd_write_ops{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{pool}} Write",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rbd_read_ops{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{pool}} Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "iops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "iops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rbd_write_bytes{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{pool}} Write",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rbd_read_bytes{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{pool}} Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Throughput",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rbd_write_latency_sum{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval]) /\n rate(ceph_rbd_write_latency_count{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{pool}} Write",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rbd_read_latency_sum{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval]) /\n rate(ceph_rbd_read_latency_count{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{pool}} Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Average Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "ns",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "ns",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "",
+ "multi": false,
+ "name": "pool",
+ "options": [ ],
+ "query": "label_values(pool)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "",
+ "multi": false,
+ "name": "image",
+ "options": [ ],
+ "query": "label_values(image)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RBD Details",
+ "uid": "YhCYGcuZz",
+ "version": 0
+}
diff --git a/monitoring/ceph-mixin/dashboards_out/rbd-overview.json b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json
new file mode 100644
index 000000000..e017280e0
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json
@@ -0,0 +1,737 @@
+{
+ "__inputs": [ ],
+ "__requires": [
+ {
+ "id": "grafana",
+ "name": "Grafana",
+ "type": "grafana",
+ "version": "5.4.2"
+ },
+ {
+ "id": "graph",
+ "name": "Graph",
+ "type": "panel",
+ "version": "5.0.0"
+ },
+ {
+ "id": "prometheus",
+ "name": "Prometheus",
+ "type": "datasource",
+ "version": "5.0.0"
+ },
+ {
+ "id": "table",
+ "name": "Table",
+ "type": "panel",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "showIn": 0,
+ "tags": [ ],
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [ ],
+ "panels": [
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(rate(ceph_rbd_write_ops{job=~\"$job\"}[$__rate_interval])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(rate(ceph_rbd_read_ops{job=~\"$job\"}[$__rate_interval])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(rate(ceph_rbd_write_bytes{job=~\"$job\"}[$__rate_interval])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(rate(ceph_rbd_read_bytes{job=~\"$job\"}[$__rate_interval])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Throughput",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": { },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(\n sum(rate(ceph_rbd_write_latency_sum{job=~\"$job\"}[$__rate_interval])) /\n sum(rate(ceph_rbd_write_latency_count{job=~\"$job\"}[$__rate_interval]))\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write",
+ "refId": "A"
+ },
+ {
+ "expr": "round(\n sum(rate(ceph_rbd_read_latency_sum{job=~\"$job\"}[$__rate_interval])) /\n sum(rate(ceph_rbd_read_latency_count{job=~\"$job\"}[$__rate_interval]))\n)\n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Average Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [ ]
+ },
+ "yaxes": [
+ {
+ "format": "ns",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": 0,
+ "show": true
+ }
+ ]
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 7
+ },
+ "id": 5,
+ "links": [ ],
+ "sort": {
+ "col": 3,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Pool",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "pool",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Image",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "image",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "IOPS",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "iops",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (\n sort((\n rate(ceph_rbd_write_ops{job=~\"$job\"}[$__rate_interval]) +\n on (image, pool, namespace) rate(ceph_rbd_read_ops{job=~\"$job\"}[$__rate_interval])\n ))\n )\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Highest IOPS",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 7
+ },
+ "id": 6,
+ "links": [ ],
+ "sort": {
+ "col": 3,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Pool",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "pool",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Image",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "image",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Throughput",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "Bps",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n sort(\n sum(\n rate(ceph_rbd_read_bytes{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rbd_write_bytes{job=~\"$job\"}[$__rate_interval])\n ) by (pool, image, namespace)\n )\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Highest Throughput",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 7
+ },
+ "id": 7,
+ "links": [ ],
+ "sort": {
+ "col": 3,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Pool",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "pool",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Image",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "image",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Latency",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "ns",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n sum(\n rate(ceph_rbd_write_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_write_latency_count{job=~\"$job\"}[$__rate_interval]), 1) +\n rate(ceph_rbd_read_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_read_latency_count{job=~\"$job\"}[$__rate_interval]), 1)\n ) by (pool, image, namespace)\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Highest Latency",
+ "transform": "table",
+ "type": "table"
+ }
+ ],
+ "refresh": "30s",
+ "rows": [ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph-mixin",
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": "cluster",
+ "multi": true,
+ "name": "cluster",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata, cluster)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".+",
+ "current": { },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "job",
+ "multi": true,
+ "name": "job",
+ "options": [ ],
+ "query": "label_values(ceph_osd_metadata{}, job)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RBD Overview",
+ "uid": "41FrpeUiz",
+ "version": 0
+}