summaryrefslogtreecommitdiffstats
path: root/monitoring/ceph-mixin/dashboards/osd.libsonnet
diff options
context:
space:
mode:
Diffstat (limited to 'monitoring/ceph-mixin/dashboards/osd.libsonnet')
-rw-r--r--monitoring/ceph-mixin/dashboards/osd.libsonnet618
1 files changed, 618 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet
new file mode 100644
index 000000000..0ea43c96f
--- /dev/null
+++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet
@@ -0,0 +1,618 @@
+local g = import 'grafonnet/grafana.libsonnet';
+
+(import 'utils.libsonnet') {
+ 'osds-overview.json':
+ $.dashboardSchema(
+ 'OSD Overview',
+ '',
+ 'lo02I1Aiz',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags,
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='table', name='Table', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addPanels([
+ $.simpleGraphPanel(
+ { '@95%ile': '#e0752d' },
+ 'OSD Read Latencies',
+ '',
+ 'ms',
+ null,
+ '0',
+ |||
+ avg (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
+ )
+ ||| % $.matchers(),
+ 'AVG read',
+ 0,
+ 0,
+ 8,
+ 8
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ max(
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
+ )
+ ||| % $.matchers(),
+ 'MAX read'
+ ),
+ $.addTargetSchema(
+ |||
+ quantile(0.95,
+ (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
+ * 1000
+ )
+ )
+ ||| % $.matchers(),
+ '@95%ile'
+ ),
+ ],
+ ),
+ $.addTableSchema(
+ '$datasource',
+ "This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
+ { col: 2, desc: true },
+ [
+ $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+ $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest READ Latencies',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (sort(
+ (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
+ )
+ ))
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
+ $.simpleGraphPanel(
+ {
+ '@95%ile write': '#e0752d',
+ },
+ 'OSD Write Latencies',
+ '',
+ 'ms',
+ null,
+ '0',
+ |||
+ avg(
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
+ * 1000
+ )
+ ||| % $.matchers(),
+ 'AVG write',
+ 12,
+ 0,
+ 8,
+ 8
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ max(
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
+ )
+ ||| % $.matchers(), 'MAX write'
+ ),
+ $.addTargetSchema(
+ |||
+ quantile(0.95, (
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
+ ))
+ ||| % $.matchers(), '@95%ile write'
+ ),
+ ],
+ ),
+ $.addTableSchema(
+ '$datasource',
+ "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
+ { col: 2, desc: true },
+ [
+ $.overviewStyle(
+ 'OSD ID', 'ceph_daemon', 'string', 'short'
+ ),
+ $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest WRITE Latencies',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (sort(
+ (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000)
+ ))
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
+ $.simplePieChart(
+ {}, '', 'OSD Types Summary'
+ )
+ .addTarget(
+ $.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}')
+ ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
+ $.simplePieChart(
+ { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2
+ )
+ )
+ .addTarget(
+ $.addTargetSchema(
+ 'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2
+ )
+ ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
+ $.simplePieChart(
+ {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
+ )
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '<12TB+', 'time_series', 2
+ )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
+ g.graphPanel.new(bars=true,
+ datasource='$datasource',
+ title='Distribution of PGs per OSD',
+ x_axis_buckets=20,
+ x_axis_mode='histogram',
+ x_axis_values=['total'],
+ formatY1='short',
+ formatY2='short',
+ labelY1='# of OSDs',
+ min='0',
+ nullPointMode='null')
+ .addTarget($.addTargetSchema(
+ 'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true
+ )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
+ $.gaugeSingleStatPanel(
+ 'percentunit',
+ 'OSD onode Hits Ratio',
+ 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
+ 'current',
+ true,
+ 1,
+ true,
+ false,
+ '.75',
+ |||
+ sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
+ sum(ceph_bluestore_onode_hits{%(matchers)s}) +
+ sum(ceph_bluestore_onode_misses{%(matchers)s})
+ )
+ ||| % $.matchers(),
+ 'time_series',
+ 20,
+ 8,
+ 4,
+ 8
+ ),
+ $.addRowSchema(false,
+ true,
+ 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
+ $.simpleGraphPanel(
+ {},
+ 'Read/Write Profile',
+ 'Show the read/write workload profile overtime',
+ 'short',
+ null,
+ null,
+ 'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'Reads',
+ 0,
+ 17,
+ 24,
+ 8
+ )
+ .addTargets([$.addTargetSchema(
+ 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
+ )]),
+ $.addTableSchema(
+ '$datasource',
+ 'This table shows the 10 OSDs with the highest number of slow ops',
+ { col: 2, desc: true },
+ [
+ $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+ $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Top Slow Ops',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
+ ]),
+ 'osd-device-details.json':
+ local OsdDeviceDetailsPanel(title,
+ description,
+ formatY1,
+ labelY1,
+ expr1,
+ expr2,
+ legendFormat1,
+ legendFormat2,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema({},
+ title,
+ description,
+ 'null',
+ false,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ null,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(expr1,
+ legendFormat1),
+ $.addTargetSchema(expr2, legendFormat2),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(
+ 'OSD device details',
+ '',
+ 'CrAHE0iZz',
+ 'now-3h',
+ '30s',
+ 16,
+ $._config.dashboardTags,
+ ''
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('osd',
+ '$datasource',
+ 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ false,
+ 1,
+ 'OSD',
+ '(.*)')
+ )
+ .addPanels([
+ $.addRowSchema(
+ false, true, 'OSD Performance'
+ ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ OsdDeviceDetailsPanel(
+ '$osd Latency',
+ '',
+ 's',
+ 'Read (-) / Write (+)',
+ |||
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ |||
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ 'read',
+ 'write',
+ 0,
+ 1,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ {
+ alias: 'read',
+ transform: 'negative-Y',
+ }
+ ),
+ OsdDeviceDetailsPanel(
+ '$osd R/W IOPS',
+ '',
+ 'short',
+ 'Read (-) / Write (+)',
+ 'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'Reads',
+ 'Writes',
+ 6,
+ 1,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: 'Reads', transform: 'negative-Y' }
+ ),
+ OsdDeviceDetailsPanel(
+ '$osd R/W Bytes',
+ '',
+ 'bytes',
+ 'Read (-) / Write (+)',
+ 'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'Read Bytes',
+ 'Write Bytes',
+ 12,
+ 1,
+ 6,
+ 9
+ )
+ .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
+ $.addRowSchema(
+ false, true, 'Physical Device Performance'
+ ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
+ OsdDeviceDetailsPanel(
+ 'Physical Device Latency for $osd',
+ '',
+ 's',
+ 'Read (-) / Write (+)',
+ |||
+ (
+ label_replace(
+ rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
+ rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ )
+ ||| % $.matchers(),
+ |||
+ (
+ label_replace(
+ rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
+ rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
+ label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ )
+ ||| % $.matchers(),
+ '{{instance}}/{{device}} Reads',
+ '{{instance}}/{{device}} Writes',
+ 0,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ OsdDeviceDetailsPanel(
+ 'Physical Device R/W IOPS for $osd',
+ '',
+ 'short',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ |||
+ label_replace(
+ rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}} on {{instance}} Writes',
+ '{{device}} on {{instance}} Reads',
+ 6,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ OsdDeviceDetailsPanel(
+ 'Physical Device R/W Bytes for $osd',
+ '',
+ 'Bps',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ |||
+ label_replace(
+ rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{instance}} {{device}} Reads',
+ '{{instance}} {{device}} Writes',
+ 12,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ $.graphPanelSchema(
+ {},
+ 'Physical Device Util% for $osd',
+ '',
+ 'null',
+ false,
+ 'percentunit',
+ 'short',
+ null,
+ null,
+ null,
+ 1,
+ '$datasource'
+ )
+ .addTarget($.addTargetSchema(
+ |||
+ label_replace(
+ rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}} on {{instance}}'
+ )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
+ ]),
+}