summaryrefslogtreecommitdiffstats
path: root/monitoring
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /monitoring
parentInitial commit. (diff)
downloadceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'monitoring')
-rw-r--r--monitoring/grafana/README.md14
-rw-r--r--monitoring/grafana/dashboards/CMakeLists.txt8
-rw-r--r--monitoring/grafana/dashboards/README28
-rw-r--r--monitoring/grafana/dashboards/ceph-cluster.json1251
-rw-r--r--monitoring/grafana/dashboards/cephfs-overview.json309
-rw-r--r--monitoring/grafana/dashboards/host-details.json1215
-rw-r--r--monitoring/grafana/dashboards/hosts-overview.json854
-rw-r--r--monitoring/grafana/dashboards/osd-device-details.json800
-rw-r--r--monitoring/grafana/dashboards/osds-overview.json876
-rw-r--r--monitoring/grafana/dashboards/pool-detail.json665
-rw-r--r--monitoring/grafana/dashboards/pool-overview.json706
-rw-r--r--monitoring/grafana/dashboards/radosgw-detail.json491
-rw-r--r--monitoring/grafana/dashboards/radosgw-overview.json630
-rw-r--r--monitoring/grafana/dashboards/rbd-details.json409
-rw-r--r--monitoring/grafana/dashboards/rbd-overview.json685
-rw-r--r--monitoring/grafana/screenshots/ceph-cluster1.pngbin0 -> 201798 bytes
-rw-r--r--monitoring/grafana/screenshots/ceph-cluster2.pngbin0 -> 204800 bytes
-rw-r--r--monitoring/grafana/screenshots/host-details.pngbin0 -> 214033 bytes
-rw-r--r--monitoring/grafana/screenshots/host-details.resized.pngbin0 -> 833169 bytes
-rw-r--r--monitoring/grafana/screenshots/host_overview.pngbin0 -> 403994 bytes
-rw-r--r--monitoring/grafana/screenshots/mds-performance.pngbin0 -> 52682 bytes
-rw-r--r--monitoring/grafana/screenshots/osd-performance.pngbin0 -> 137137 bytes
-rw-r--r--monitoring/grafana/screenshots/osd_overview.pngbin0 -> 358872 bytes
-rw-r--r--monitoring/grafana/screenshots/pool-details.pngbin0 -> 123041 bytes
-rw-r--r--monitoring/grafana/screenshots/pool-overview.pngbin0 -> 102273 bytes
-rw-r--r--monitoring/grafana/screenshots/rgw-detail.pngbin0 -> 277302 bytes
-rw-r--r--monitoring/grafana/screenshots/rgw-overview.pngbin0 -> 296415 bytes
-rw-r--r--monitoring/prometheus/README.md7
-rw-r--r--monitoring/prometheus/alerts/ceph_default_alerts.yml268
29 files changed, 9216 insertions, 0 deletions
diff --git a/monitoring/grafana/README.md b/monitoring/grafana/README.md
new file mode 100644
index 00000000..4054e985
--- /dev/null
+++ b/monitoring/grafana/README.md
@@ -0,0 +1,14 @@
+## Grafana dashboards for Ceph
+
+Here you can find a collection of [Grafana](https://grafana.com/grafana)
+dashboards for Ceph Monitoring. These dashboards are based on metrics collected
+from [prometheus](https://prometheus.io/) scraping the [prometheus mgr
+plugin](http://docs.ceph.com/docs/master/mgr/prometheus/) and the
+[node_exporter](https://github.com/prometheus/node_exporter).
+
+### Other requirements
+
+- Luminous 12.2.5 or newer
+- [Status Panel](https://grafana.com/plugins/vonage-status-panel) installed
+- node_exporter 0.15.x and 0.16.x are supported (host details and hosts
+overview dashboards)
diff --git a/monitoring/grafana/dashboards/CMakeLists.txt b/monitoring/grafana/dashboards/CMakeLists.txt
new file mode 100644
index 00000000..41ca947a
--- /dev/null
+++ b/monitoring/grafana/dashboards/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(CEPH_GRAFANA_DASHBOARDS_DIR "${CMAKE_INSTALL_SYSCONFDIR}/grafana/dashboards/ceph-dashboard"
+ CACHE PATH "Location for grafana dashboards")
+
+FILE(GLOB CEPH_GRAFANA_DASHBOARDS "*.json")
+
+install(FILES
+ ${CEPH_GRAFANA_DASHBOARDS}
+ DESTINATION ${CEPH_GRAFANA_DASHBOARDS_DIR})
diff --git a/monitoring/grafana/dashboards/README b/monitoring/grafana/dashboards/README
new file mode 100644
index 00000000..3803cd7a
--- /dev/null
+++ b/monitoring/grafana/dashboards/README
@@ -0,0 +1,28 @@
+Context
+These dashboards should be enough to get started on the integration. It's not a complete set, so more will be added in the next week.
+
+Bare in mind that the osd device details dashboard needs node_exporter active - all the other dashboards pick data out of ceph-mgr based metrics.
+
+
+The cephfs dashboard only has 2 panels currently. The counter available are
+a little light at the moment. Patrick/Venky have been addressing this with
+https://bugzilla.redhat.com/show_bug.cgi?id=1618523
+cephfs-overview.json
+
+Host Information
+host-details.json combines generic server metrics that show cpu/memory/network stats (including network errors/drops),
+with disk level stats for OSD hosts. OSD charts show the physical device name together with it's corresponding osd id for correlation.
+
+Ceph Pools
+two dashboards. Overview gives the high level combined view, pool-detail needs a pool_name variable passed to it (currently uses a templating var which is visible)
+pool-overview.json
+pool-detail.json
+
+OSD Device Details. This dashboard needs some further work. It currently shows
+OSD level stats with physical device stats but leaves out some of the counters
+that cephmetrics provides for trouble shooting.
+osd-device-details.json
+
+Object gateway dashboards, again split into overview and detail. The detail dashboard needs the relevant ceph-deamon name for the rgw instance.
+radosgw-overview.json
+radosgw-detail.json
diff --git a/monitoring/grafana/dashboards/ceph-cluster.json b/monitoring/grafana/dashboards/ceph-cluster.json
new file mode 100644
index 00000000..447dbb29
--- /dev/null
+++ b/monitoring/grafana/dashboards/ceph-cluster.json
@@ -0,0 +1,1251 @@
+{
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "heatmap",
+ "name": "Heatmap",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "vonage-status-panel",
+ "name": "Status Panel",
+ "version": "1.0.8"
+ }
+ ],
+ "annotations": {
+ "list": []
+ },
+ "description": "Ceph cluster overview",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1525415495309,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": true,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 128, 45, 0.9)",
+ "rgba(237, 129, 40, 0.9)",
+ "rgb(255, 0, 0)"
+ ],
+ "datasource": "$datasource",
+ "editable": false,
+ "error": false,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 0
+ },
+ "hideTimeOverride": true,
+ "id": 21,
+ "interval": "1m",
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "ceph_health_status{instance=~'$instance'}",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "1,2",
+ "timeFrom": null,
+ "title": "Health Status",
+ "transparent": false,
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "OK",
+ "value": "0"
+ },
+ {
+ "op": "=",
+ "text": "WARN",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "ERR",
+ "value": "2"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "colorMode": "Panel",
+ "colors": {
+ "crit": "rgb(255, 0, 0)",
+ "disable": "rgba(128, 128, 128, 0.9)",
+ "ok": "rgba(50, 128, 45, 0.9)",
+ "warn": "rgba(237, 129, 40, 0.9)"
+ },
+ "cornerRadius": 0,
+ "datasource": "$datasource",
+ "displayName": "",
+ "flipCard": false,
+ "flipTime": 5,
+ "fontFormat": "Regular",
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 0
+ },
+ "id": 43,
+ "isAutoScrollOnOverflow": false,
+ "isGrayOnNoData": false,
+ "isHideAlertsOnDisable": false,
+ "isIgnoreOKColors": false,
+ "links": [],
+ "targets": [
+ {
+ "aggregation": "Last",
+ "alias": "All",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "count(ceph_osd_metadata{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "All",
+ "refId": "A",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "In",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osds_in{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "In",
+ "refId": "B",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Out",
+ "decimals": 2,
+ "displayAliasType": "Warning / Critical",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osd_in{instance=~\"$instance\"} == bool 0)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Out",
+ "refId": "C",
+ "units": "none",
+ "valueHandler": "Number Threshold",
+ "warn": 1
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Up",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osd_up{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Up",
+ "refId": "D",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Down",
+ "crit": 2,
+ "decimals": 2,
+ "displayAliasType": "Warning / Critical",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_osd_up{instance=~\"$instance\"} == bool 0)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Down",
+ "refId": "E",
+ "units": "none",
+ "valueHandler": "Number Threshold",
+ "warn": 1
+ }
+ ],
+ "title": "OSDs",
+ "type": "vonage-status-panel"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "decimals": 2,
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 47,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_osd_stat_bytes_used{instance=~\"$instance\"})/sum(ceph_osd_stat_bytes{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Used",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "70,80",
+ "title": "Capacity used",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 53,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Active",
+ "color": "#508642",
+ "fill": 1,
+ "stack": "A"
+ },
+ {
+ "alias": "Total",
+ "color": "#f9e2d2"
+ },
+ {
+ "alias": "Degraded",
+ "color": "#eab839"
+ },
+ {
+ "alias": "Undersized",
+ "color": "#f9934e"
+ },
+ {
+ "alias": "Inconsistent",
+ "color": "#e24d42"
+ },
+ {
+ "alias": "Down",
+ "color": "#bf1b00"
+ },
+ {
+ "alias": "Inactive",
+ "color": "#bf1b00",
+ "fill": 4,
+ "linewidth": 0,
+ "stack": "A"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(ceph_pg_total)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Total",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(ceph_pg_active)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Active",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(ceph_pg_total - ceph_pg_active)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Inactive",
+ "refId": "G"
+ },
+ {
+ "expr": "sum(ceph_pg_undersized)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Undersized",
+ "refId": "F"
+ },
+ {
+ "expr": "sum(ceph_pg_degraded)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Degraded",
+ "refId": "C"
+ },
+ {
+ "expr": "sum(ceph_pg_inconsistent)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Inconsistent",
+ "refId": "D"
+ },
+ {
+ "expr": "sum(ceph_pg_down)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Down",
+ "refId": "E"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PG States",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 66,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Avg Apply Latency",
+ "color": "#7eb26d"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "quantile(0.95, ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Apply Latency P_95",
+ "refId": "A"
+ },
+ {
+ "expr": "quantile(0.95, ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Commit Latency P_95",
+ "refId": "B"
+ },
+ {
+ "expr": "avg(ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Avg Apply Latency",
+ "refId": "C"
+ },
+ {
+ "expr": "avg(ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Avg Commit Latency",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "clusterName": "",
+ "colorMode": "Panel",
+ "colors": {
+ "crit": "rgba(245, 54, 54, 0.9)",
+ "disable": "rgba(128, 128, 128, 0.9)",
+ "ok": "rgba(50, 128, 45, 0.9)",
+ "warn": "rgba(237, 129, 40, 0.9)"
+ },
+ "cornerRadius": 1,
+ "datasource": "$datasource",
+ "displayName": "",
+ "flipCard": false,
+ "flipTime": 5,
+ "fontFormat": "Regular",
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 3
+ },
+ "id": 41,
+ "isAutoScrollOnOverflow": false,
+ "isGrayOnNoData": false,
+ "isHideAlertsOnDisable": false,
+ "isIgnoreOKColors": false,
+ "links": [],
+ "targets": [
+ {
+ "aggregation": "Last",
+ "alias": "In Quorum",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "sum(ceph_mon_quorum_status{instance=~\"$instance\"})",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "In Quorum",
+ "refId": "A",
+ "units": "none",
+ "valueHandler": "Text Only"
+ },
+ {
+ "aggregation": "Last",
+ "alias": "Total",
+ "crit": 1,
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Total",
+ "refId": "B",
+ "units": "none",
+ "valueHandler": "Text Only",
+ "warn": 2
+ },
+ {
+ "aggregation": "Last",
+ "alias": "MONs out of Quorum",
+ "crit": 1.6,
+ "decimals": 2,
+ "displayAliasType": "Warning / Critical",
+ "displayType": "Annotation",
+ "displayValueWithAlias": "Never",
+ "expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"}) / sum(ceph_mon_quorum_status{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MONs out of Quorum",
+ "refId": "C",
+ "units": "none",
+ "valueHandler": "Number Threshold",
+ "warn": 1.1
+ }
+ ],
+ "title": "Monitors",
+ "type": "vonage-status-panel"
+ },
+ {
+ "colorMode": "Disabled",
+ "colors": {
+ "crit": "rgba(245, 54, 54, 0.9)",
+ "disable": "rgba(128, 128, 128, 0.9)",
+ "ok": "rgba(50, 128, 45, 0.9)",
+ "warn": "rgba(237, 129, 40, 0.9)"
+ },
+ "cornerRadius": 0,
+ "datasource": "$datasource",
+ "displayName": "",
+ "flipCard": false,
+ "flipTime": 5,
+ "fontFormat": "Regular",
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 3
+ },
+ "id": 68,
+ "isAutoScrollOnOverflow": false,
+ "isGrayOnNoData": false,
+ "isHideAlertsOnDisable": false,
+ "isIgnoreOKColors": false,
+ "links": [],
+ "targets": [
+ {
+ "aggregation": "Last",
+ "alias": "Clients",
+ "decimals": 2,
+ "displayAliasType": "Always",
+ "displayType": "Regular",
+ "displayValueWithAlias": "When Alias Displayed",
+ "expr": "ceph_mds_server_handle_client_session{instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Clients",
+ "refId": "A",
+ "units": "none",
+ "valueHandler": "Number Threshold"
+ }
+ ],
+ "title": "Client connections",
+ "type": "vonage-status-panel"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "id": 45,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 0.5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(ceph_osd_op_w_in_bytes{instance=~\"$instance\"}[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(irate(ceph_osd_op_r_out_bytes{instance=~\"$instance\"}[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cluster I/O",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "id": 62,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(deriv(ceph_pool_stored{instance=~\"$instance\"}[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "In-/Egress",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": " Egress (-) / Ingress (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": 1
+ },
+ "color": {
+ "cardColor": "rgb(0, 254, 255)",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateBlues",
+ "exponent": 0.5,
+ "min": null,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": "$datasource",
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 15
+ },
+ "heatmap": {},
+ "highlightCards": true,
+ "id": 55,
+ "legend": {
+ "show": true
+ },
+ "links": [],
+ "span": 12,
+ "targets": [
+ {
+ "expr": "ceph_osd_stat_bytes_used{instance=~'$instance'} / ceph_osd_stat_bytes{instance=~'$instance'}",
+ "format": "time_series",
+ "interval": "1m",
+ "intervalFactor": 1,
+ "legendFormat": "Util (%)",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "timeFrom": null,
+ "title": "OSD Capacity Utilization",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": "",
+ "yAxis": {
+ "decimals": null,
+ "format": "percentunit",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": 1
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateBlues",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": "$datasource",
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 15
+ },
+ "heatmap": {},
+ "highlightCards": true,
+ "id": 59,
+ "legend": {
+ "show": true
+ },
+ "links": [],
+ "targets": [
+ {
+ "expr": "ceph_osd_numpg{instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "#PGs",
+ "refId": "A"
+ }
+ ],
+ "title": "PGs per OSD",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": "",
+ "yAxis": {
+ "decimals": null,
+ "format": "none",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 15
+ },
+ "id": 64,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(ceph_osd_recovery_ops[1m]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Op/s",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Recovery Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": "Recovery Ops/s",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "ceph",
+ "cluster"
+ ],
+ "templating": {
+ "list": [
+ {
+ "hide": 0,
+ "label": null,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "auto": true,
+ "auto_count": 10,
+ "auto_min": "1m",
+ "current": {
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ "datasource": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "Interval",
+ "multi": false,
+ "name": "interval",
+ "options": [
+ {
+ "selected": true,
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ {
+ "selected": false,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "refresh": 2,
+ "type": "interval"
+ },
+ {
+ "allFormat": "glob",
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 0,
+ "hideLabel": false,
+ "includeAll": true,
+ "label": "Exporter Instance",
+ "multi": false,
+ "multiFormat": "glob",
+ "name": "instance",
+ "options": [],
+ "query": "label_values(ceph_health_status, instance)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Ceph - Cluster",
+ "version": 13
+ }
diff --git a/monitoring/grafana/dashboards/cephfs-overview.json b/monitoring/grafana/dashboards/cephfs-overview.json
new file mode 100644
index 00000000..57922f55
--- /dev/null
+++ b/monitoring/grafana/dashboards/cephfs-overview.json
@@ -0,0 +1,309 @@
+{
+ "__inputs": [],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.3.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1557392920097,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 10,
+ "panels": [],
+ "title": "MDS Performance",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 1
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(ceph_objecter_op_r{ceph_daemon=~\"($mds_servers).*\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read Ops",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(ceph_objecter_op_w{ceph_daemon=~\"($mds_servers).*\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write Ops",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "MDS Workload - $mds_servers",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": "Reads(-) / Writes (+)",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_mds_server_handle_client_request{ceph_daemon=~\"($mds_servers).*\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{ceph_daemon}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Client Request Load - $mds_servers",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": "Client Requests",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": "MDS Server",
+ "multi": false,
+ "name": "mds_servers",
+ "options": [],
+ "query": "label_values(ceph_mds_inodes, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "MDS Performance",
+ "uid": "tbO9LAiZz",
+ "version": 2
+}
diff --git a/monitoring/grafana/dashboards/host-details.json b/monitoring/grafana/dashboards/host-details.json
new file mode 100644
index 00000000..d02c4069
--- /dev/null
+++ b/monitoring/grafana/dashboards/host-details.json
@@ -0,0 +1,1215 @@
+{
+ "__inputs": [],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.3.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1557386759572,
+ "links": [],
+ "panels": [
+ {
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 16,
+ "title": "$ceph_hosts System Overview",
+ "type": "row"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "$datasource",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 0,
+ "y": 1
+ },
+ "height": "160",
+ "id": 1,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": "",
+ "minSpan": 4,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(sum by (ceph_daemon) (ceph_osd_metadata{hostname='$ceph_hosts'}))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 40,
+ "textEditor": true
+ }
+ ],
+ "thresholds": "",
+ "title": "OSDs",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {
+ "interrupt": "#447EBC",
+ "steal": "#6D1F62",
+ "system": "#890F02",
+ "user": "#3F6833",
+ "wait": "#C15C17"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown",
+ "fill": 1,
+ "gridPos": {
+ "h": 10,
+ "w": 6,
+ "x": 3,
+ "y": 1
+ },
+ "id": 9,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (mode) (\n irate(node_cpu{instance=~\"($ceph_hosts).*\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts).*\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m])\n) / scalar(\n sum(irate(node_cpu{instance=~\"($ceph_hosts).*\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts).*\"}[1m]))\n) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{mode}}",
+ "refId": "A",
+ "step": 10,
+ "textEditor": true
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU Utilisation",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "% Utilization",
+ "logBase": 1,
+ "max": "100",
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Available": "#508642",
+ "Free": "#508642",
+ "Total": "#bf1b00",
+ "Used": "#bf1b00",
+ "total": "#bf1b00",
+ "used": "#0a50a1"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 10,
+ "w": 6,
+ "x": 9,
+ "y": 1
+ },
+ "id": 14,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "total",
+ "color": "#bf1b00",
+ "fill": 0,
+ "linewidth": 2,
+ "stack": false
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "(node_memory_MemTotal{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]].*\"})- (\n (node_memory_MemFree{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n (node_memory_Cached{instance=~\"[[ceph_hosts]].*\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n (node_memory_Buffers{instance=~\"[[ceph_hosts]].*\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]].*\"}) +\n (node_memory_Slab{instance=~\"[[ceph_hosts]].*\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]].*\"})\n )\n \n",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "used",
+ "refId": "D"
+ },
+ {
+ "expr": "node_memory_MemFree{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]].*\"} ",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "Free",
+ "refId": "A"
+ },
+ {
+ "expr": "(node_memory_Cached{instance=~\"[[ceph_hosts]].*\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n(node_memory_Buffers{instance=~\"[[ceph_hosts]].*\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]].*\"}) +\n(node_memory_Slab{instance=~\"[[ceph_hosts]].*\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]].*\"}) \n",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "buffers/cache",
+ "refId": "C"
+ },
+ {
+ "expr": "node_memory_MemTotal{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]].*\"} ",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "total",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "RAM Usage",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "RAM used",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')",
+ "fill": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 6,
+ "x": 15,
+ "y": 1
+ },
+ "id": 10,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideZero": true,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*tx/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (device) (\n irate(node_network_receive_bytes{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m]) or \n irate(node_network_receive_bytes_total{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m])\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.rx",
+ "refId": "A",
+ "step": 10,
+ "textEditor": true
+ },
+ {
+ "expr": "sum by (device) (\n irate(node_network_transmit_bytes{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m])\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.tx",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network Load",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "decbytes",
+ "label": "Send (-) / Receive (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 21,
+ "y": 1
+ },
+ "hideTimeOverride": true,
+ "id": 18,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*tx/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_drop{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_receive_drop_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.rx",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(node_network_transmit_drop{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_transmit_drop_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.tx",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network drop rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "Send (-) / Receive (+)",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "$datasource",
+ "decimals": 0,
+ "description": "Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 0,
+ "y": 6
+ },
+ "height": "160",
+ "id": 2,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": "",
+ "minSpan": 4,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~\"($ceph_hosts).*\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 40,
+ "textEditor": true
+ }
+ ],
+ "thresholds": "",
+ "title": "Raw Capacity",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 21,
+ "y": 6
+ },
+ "hideTimeOverride": true,
+ "id": 19,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*tx/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_errs{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_receive_errs_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.rx",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(node_network_transmit_errs{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_transmit_errs_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}.tx",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network error rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "Send (-) / Receive (+)",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 11
+ },
+ "id": 12,
+ "panels": [],
+ "repeat": null,
+ "title": "OSD Disk Performance Statistics",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 0,
+ "y": 12
+ },
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "connected",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts).*\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts).*\"}[5m])\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) writes",
+ "refId": "A",
+ "step": 10,
+ "textEditor": true
+ },
+ {
+ "expr": "label_replace(\n (irate(node_disk_reads_completed{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts).*\"}[5m])),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$ceph_hosts Disk IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 12,
+ "y": 12
+ },
+ "id": 8,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "connected",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*read/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace((irate(node_disk_bytes_written{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts).*\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) write",
+ "refId": "B"
+ },
+ {
+ "expr": "label_replace((irate(node_disk_bytes_read{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts).*\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}}) read",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$ceph_hosts Throughput by Disk",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 0,
+ "y": 21
+ },
+ "id": 7,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts).*\"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts).*\"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")) * on(instance,device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}})",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$ceph_hosts Disk Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 11,
+ "x": 12,
+ "y": 21
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 2,
+ "nullPointMode": "connected",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(((irate(node_disk_io_time_ms{instance=~\"($ceph_hosts).*\"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts).*\"}[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{device}}({{ceph_daemon}})",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "$ceph_hosts Disk utilisation",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "%Util",
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Hostname",
+ "multi": false,
+ "name": "ceph_hosts",
+ "options": [],
+ "query": "label_values(node_scrape_collector_success, instance) ",
+ "refresh": 1,
+ "regex": "([^.:]*).*",
+ "skipUrlSync": false,
+ "sort": 3,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Host Details",
+ "uid": "rtOg0AiWz",
+ "version": 4
+}
diff --git a/monitoring/grafana/dashboards/hosts-overview.json b/monitoring/grafana/dashboards/hosts-overview.json
new file mode 100644
index 00000000..804aa51c
--- /dev/null
+++ b/monitoring/grafana/dashboards/hosts-overview.json
@@ -0,0 +1,854 @@
+{
+ "__inputs": [],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.3.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1557393917915,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 0,
+ "y": 0
+ },
+ "id": 5,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(sum by (hostname) (ceph_osd_metadata))",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "OSD Hosts",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "decimals": 0,
+ "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster",
+ "decimals": 2,
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 6,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg(\n 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n irate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n )\n )",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG CPU Busy",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "decimals": 0,
+ "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)",
+ "decimals": 2,
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 9,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg (((node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"})- (\n (node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) + \n (node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) + \n (node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"})\n )) /\n (node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"} ))",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG RAM Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "IOPS Load at the device as reported by the OS on all OSD hosts",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 12,
+ "y": 0
+ },
+ "id": 2,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum ((irate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[5m]) ) + \n(irate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[5m]) or irate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[5m])))",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Physical IOPS",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)",
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 16,
+ "y": 0
+ },
+ "id": 20,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr" : "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device) label_replace(label_replace(ceph_disk_occupation{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "AVG Disk Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "decimals": 0,
+ "description": "Total send/receive network load across all hosts in the ceph cluster",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 4,
+ "x": 20,
+ "y": 0
+ },
+ "id": 18,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum (\n irate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n ) +\nsum (\n irate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n )",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Network Load",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Show the top 10 busiest hosts by cpu",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 5
+ },
+ "id": 13,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk(10,100 * ( 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n irate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n )\n )\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Busy - Top 10 Hosts",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 1,
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Top 10 hosts by network load",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 5
+ },
+ "id": 19,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk(10, (sum by(instance) (\n (\n irate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n ) +\n (\n irate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n ))\n )\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Load - Top 10 Hosts",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 1,
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": "",
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "osd_hosts",
+ "options": [],
+ "query": "label_values(ceph_disk_occupation, exported_instance)",
+ "refresh": 1,
+ "regex": "([^.]*).*",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "ceph",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "mon_hosts",
+ "options": [],
+ "query": "label_values(ceph_mon_metadata, ceph_daemon)",
+ "refresh": 1,
+ "regex": "mon.(.*)",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "mds_hosts",
+ "options": [],
+ "query": "label_values(ceph_mds_inodes, ceph_daemon)",
+ "refresh": 1,
+ "regex": "mds.(.*)",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "rgw_hosts",
+ "options": [],
+ "query": "label_values(ceph_rgw_qlen, ceph_daemon)",
+ "refresh": 1,
+ "regex": "rgw.(.*)",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Host Overview",
+ "uid": "y0KGL0iZz",
+ "version": 3
+}
diff --git a/monitoring/grafana/dashboards/osd-device-details.json b/monitoring/grafana/dashboards/osd-device-details.json
new file mode 100644
index 00000000..deb6ed7d
--- /dev/null
+++ b/monitoring/grafana/dashboards/osd-device-details.json
@@ -0,0 +1,800 @@
+{
+ "__inputs": [],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.3.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1557395861896,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 14,
+ "panels": [],
+ "title": "OSD Performance",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "read",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_osd_op_r_latency_sum{ceph_daemon=~\"$osd\"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "read",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_osd_op_w_latency_sum{ceph_daemon=~\"$osd\"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "write",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$osd Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 1
+ },
+ "id": 8,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_osd_op_r{ceph_daemon=~\"$osd\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_osd_op_w{ceph_daemon=~\"$osd\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$osd R/W IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 12,
+ "y": 1
+ },
+ "id": 7,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Read Bytes",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_osd_op_r_out_bytes{ceph_daemon=~\"$osd\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read Bytes",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_osd_op_w_in_bytes{ceph_daemon=~\"$osd\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write Bytes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$osd R/W Bytes",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 10
+ },
+ "id": 12,
+ "panels": [],
+ "title": "Physical Device Performance",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 0,
+ "y": 11
+ },
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}/{{device}} Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}/{{device}} Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device Latency for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 6,
+ "y": 11
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}} on {{instance}} Writes",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}} on {{instance}} Reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device R/W IOPS for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 12,
+ "y": 11
+ },
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Reads/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(irate(node_disk_read_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "label_replace(irate(node_disk_written_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}} {{device}} Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device R/W Bytes for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 6,
+ "x": 18,
+ "y": 11
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(irate(node_disk_io_time_seconds_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device}} on {{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Physical Device Util% for $osd",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percentunit",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "OSD",
+ "multi": false,
+ "name": "osd",
+ "options": [],
+ "query": "label_values(ceph_osd_metadata,ceph_daemon)",
+ "refresh": 1,
+ "regex": "(.*)",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-3h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "OSD device details",
+ "uid": "CrAHE0iZz",
+ "version": 3
+}
diff --git a/monitoring/grafana/dashboards/osds-overview.json b/monitoring/grafana/dashboards/osds-overview.json
new file mode 100644
index 00000000..4b91df9e
--- /dev/null
+++ b/monitoring/grafana/dashboards/osds-overview.json
@@ -0,0 +1,876 @@
+{
+
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "grafana-piechart-panel",
+ "name": "Pie Chart",
+ "version": "1.3.3"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1538083987689,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {
+ "@95%ile": "#e0752d"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 12,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "AVG read",
+ "refId": "A"
+ },
+ {
+ "expr": "max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MAX read",
+ "refId": "B"
+ },
+ {
+ "expr": "quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "@95%ile",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Read Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "columns": [],
+ "datasource": "$datasource",
+ "description": "This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 15,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "OSD ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "ceph_daemon",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Latency (ms)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "none"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n ))\n)\n\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Highest READ Latencies",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "aliasColors": {
+ "@95%ile write": "#e0752d"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 12,
+ "y": 0
+ },
+ "id": 13,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "AVG write",
+ "refId": "A"
+ },
+ {
+ "expr": "max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "MAX write",
+ "refId": "B"
+ },
+ {
+ "expr": "quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "@95%ile write",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Write Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "columns": [],
+ "datasource": "$datasource",
+ "description": "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 20,
+ "y": 0
+ },
+ "id": 16,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "OSD ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "ceph_daemon",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Latency (ms)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "none"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n ))\n)\n\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Highest WRITE Latencies",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "aliasColors": {},
+ "breakPoint": "50%",
+ "cacheTimeout": null,
+ "combine": {
+ "label": "Others",
+ "threshold": 0
+ },
+ "datasource": "$datasource",
+ "fontSize": "80%",
+ "format": "none",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 0,
+ "y": 8
+ },
+ "id": 2,
+ "interval": null,
+ "legend": {
+ "show": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "links": [],
+ "maxDataPoints": 3,
+ "nullPointMode": "connected",
+ "pieType": "pie",
+ "strokeWidth": 1,
+ "targets": [
+ {
+ "expr": "count by (device_class) (ceph_osd_metadata)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{device_class}}",
+ "refId": "A"
+ }
+ ],
+ "title": "OSD Types Summary",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {
+ "Non-Encrypted": "#E5AC0E"
+ },
+ "breakPoint": "50%",
+ "cacheTimeout": null,
+ "combine": {
+ "label": "Others",
+ "threshold": 0
+ },
+ "datasource": "$datasource",
+ "fontSize": "80%",
+ "format": "none",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 4,
+ "y": 8
+ },
+ "height": "200px",
+ "hideTimeOverride": true,
+ "id": 4,
+ "interval": null,
+ "legend": {
+ "percentage": false,
+ "show": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "links": [],
+ "maxDataPoints": "1",
+ "minSpan": 4,
+ "nullPointMode": "connected",
+ "pieType": "pie",
+ "strokeWidth": 1,
+ "targets": [
+ {
+ "expr": "count(ceph_bluefs_wal_total_bytes)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "bluestore",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "count(ceph_osd_metadata) - count(ceph_bluefs_wal_total_bytes)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "filestore",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "filestore",
+ "refId": "C",
+ "step": 240
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Objectstore Types",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "breakPoint": "50%",
+ "cacheTimeout": null,
+ "combine": {
+ "label": "Others",
+ "threshold": "0.05"
+ },
+ "datasource": "$datasource",
+ "description": "The pie chart shows the various OSD sizes used within the cluster",
+ "fontSize": "80%",
+ "format": "none",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 8,
+ "y": 8
+ },
+ "height": "220",
+ "hideTimeOverride": true,
+ "id": 8,
+ "interval": null,
+ "legend": {
+ "header": "",
+ "percentage": false,
+ "show": true,
+ "sideWidth": null,
+ "sortDesc": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "links": [],
+ "maxDataPoints": "",
+ "minSpan": 6,
+ "nullPointMode": "connected",
+ "pieType": "pie",
+ "strokeWidth": "1",
+ "targets": [
+ {
+ "expr": "count(ceph_osd_stat_bytes < 1099511627776)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<1 TB",
+ "refId": "A",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<2 TB",
+ "refId": "B",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<3TB",
+ "refId": "C",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<4TB",
+ "refId": "D",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<6TB",
+ "refId": "E",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<8TB",
+ "refId": "F",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<10TB",
+ "refId": "G",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "<12TB",
+ "refId": "H",
+ "step": 2
+ },
+ {
+ "expr": "count(ceph_osd_stat_bytes >= 13194139533312)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "12TB+",
+ "refId": "I",
+ "step": 2
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Size Summary",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": true,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Each bar indicates the number of OSD's that have a PG count in a specific range as shown on the x axis.",
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "id": 6,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": false,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_osd_numpg\n",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "PGs per OSD",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Distribution of PGs per OSD",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": 20,
+ "mode": "histogram",
+ "name": null,
+ "show": true,
+ "values": [
+ "total"
+ ]
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": "# of OSDs",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 16
+ },
+ "id": 20,
+ "panels": [],
+ "title": "R/W Profile",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Show the read/write workload profile overtime",
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 17
+ },
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(irate(ceph_pool_rd[30s])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(irate(ceph_pool_wr[30s])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Read/Write Profile",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "tags": [],
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "OSD Overview",
+ "uid": "lo02I1Aiz",
+ "version": 3
+}
diff --git a/monitoring/grafana/dashboards/pool-detail.json b/monitoring/grafana/dashboards/pool-detail.json
new file mode 100644
index 00000000..41554ed3
--- /dev/null
+++ b/monitoring/grafana/dashboards/pool-detail.json
@@ -0,0 +1,665 @@
+{
+ "__inputs": [],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.3.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1551858875941,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "decimals": 2,
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 1,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 7,
+ "x": 0,
+ "y": 0
+ },
+ "id": 12,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "(ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": ".7,.8",
+ "title": "Capacity used",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "$datasource",
+ "description": "Time till pool is full assuming the average fill rate of the last 6 hours",
+ "format": "s",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 5,
+ "x": 7,
+ "y": 0
+ },
+ "id": 14,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "(ceph_pool_max_avail / deriv(ceph_pool_stored[6h])) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"} > 0",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Time till full",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "inf",
+ "value": "null"
+ },
+ {
+ "op": "=",
+ "text": "inf",
+ "value": "N/A"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "deriv(ceph_pool_objects[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Objects per second",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Object Ingress/Egress",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": "Objects out(-) / in(+) ",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 7
+ },
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_pool_rd[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "reads",
+ "refId": "B"
+ },
+ {
+ "expr": "irate(ceph_pool_wr[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "writes",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Client IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "iops",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 7
+ },
+ "id": 7,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "reads",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_pool_rd_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "reads",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_pool_wr_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "writes",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Client Throughput",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "read_op_per_sec": "#3F6833",
+ "write_op_per_sec": "#E5AC0E"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 14
+ },
+ "id": 8,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "ceph_pool_objects * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Number of Objects",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$pool_name Objects",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Objects",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "Prometheus admin.virt1.home.fajerski.name:9090",
+ "value": "Prometheus admin.virt1.home.fajerski.name:9090"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Pool Name",
+ "multi": false,
+ "name": "pool_name",
+ "options": [],
+ "query": "label_values(ceph_pool_metadata,name)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Ceph Pool Details",
+ "uid": "-xyV8KCiz",
+ "version": 1
+}
diff --git a/monitoring/grafana/dashboards/pool-overview.json b/monitoring/grafana/dashboards/pool-overview.json
new file mode 100644
index 00000000..d8654599
--- /dev/null
+++ b/monitoring/grafana/dashboards/pool-overview.json
@@ -0,0 +1,706 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "iteration": 1551789900270,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 1
+ },
+ "id": 1,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.* read/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk($topk,rate(ceph_pool_rd[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata) ",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{name}} - read",
+ "refId": "F"
+ },
+ {
+ "expr": "topk($topk,rate(ceph_pool_wr[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata) ",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}} - write",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Top $topk Client IOPS by Pool",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": "Read (-) / Write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 1
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 12,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.* read/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "topk($topk,rate(ceph_pool_rd_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}} - read",
+ "refId": "A",
+ "textEditor": true
+ },
+ {
+ "expr": "topk($topk,rate(ceph_pool_wr_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{name}} - write",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Top $topk Client Throughput by Pool",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "Read (-) / Writes (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [
+ {
+ "text": "Current",
+ "value": "current"
+ }
+ ],
+ "datasource": "$datasource",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 9
+ },
+ "id": 3,
+ "links": [],
+ "minSpan": 12,
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 5,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Time",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "id",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool Name",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "name",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "pool_id",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "IOPS (R+W)",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "none"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk($topk,((irate(ceph_pool_rd[1m]) + irate(ceph_pool_wr[1m])) + on(pool_id) group_left(instance,name) ceph_pool_metadata))",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 2,
+ "refId": "A",
+ "textEditor": true
+ }
+ ],
+ "title": "Top $topk Pools by Client IOPS",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [
+ {
+ "text": "Current",
+ "value": "current"
+ }
+ ],
+ "datasource": "$datasource",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 9
+ },
+ "id": 4,
+ "links": [],
+ "minSpan": 12,
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 5,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "hidden"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "id",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool Name",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "name",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "pool_id",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Throughput",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "decbytes"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk($topk,(irate(ceph_pool_rd_bytes[1m]) + irate(ceph_pool_wr_bytes[1m])) + on(pool_id) group_left(instance,name) ceph_pool_metadata) ",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 2,
+ "refId": "A",
+ "textEditor": true
+ }
+ ],
+ "title": "Top $topk Pools by Throughput",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [],
+ "datasource": "$datasource",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 9
+ },
+ "id": 5,
+ "links": [],
+ "minSpan": 8,
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 5,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Time",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool Name",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "name",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Pool ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "pool_id",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Capacity Used",
+ "colorMode": "value",
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "Value",
+ "thresholds": [
+ "70",
+ "85"
+ ],
+ "type": "number",
+ "unit": "percentunit"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk($topk,((ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(name) ceph_pool_metadata))",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "D"
+ }
+ ],
+ "title": "Top $topk Pools By Capacity Used",
+ "transform": "table",
+ "type": "table"
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "Prometheus admin.virt1.home.fajerski.name:9090",
+ "value": "Prometheus admin.virt1.home.fajerski.name:9090"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "current": {
+ "text": "15",
+ "value": "15"
+ },
+ "hide": 0,
+ "label": "Top K",
+ "name": "topk",
+ "options": [
+ {
+ "text": "15",
+ "value": "15"
+ }
+ ],
+ "query": "15",
+ "skipUrlSync": false,
+ "type": "textbox"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Ceph Pools Overview",
+ "uid": "z99hzWtmk",
+ "version": 1
+}
diff --git a/monitoring/grafana/dashboards/radosgw-detail.json b/monitoring/grafana/dashboards/radosgw-detail.json
new file mode 100644
index 00000000..648abab8
--- /dev/null
+++ b/monitoring/grafana/dashboards/radosgw-detail.json
@@ -0,0 +1,491 @@
+{
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "grafana-piechart-panel",
+ "name": "Pie Chart",
+ "version": "1.3.3"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1534386250869,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 12,
+ "panels": [],
+ "repeat": null,
+ "title": "RGW Host Detail : $rgw_servers",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "id": 34,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (ceph_daemon) (rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GET {{ceph_daemon}}",
+ "refId": "A"
+ },
+ {
+ "expr": "sum by (ceph_daemon)(rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUT {{ceph_daemon}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "$rgw_servers GET/PUT Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 7,
+ "x": 6,
+ "y": 1
+ },
+ "id": 18,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_get_b{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(ceph_rgw_put_b{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Bandwidth by HTTP Operation",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "GETs": "#7eb26d",
+ "Other": "#447ebc",
+ "PUTs": "#eab839",
+ "Requests": "#3f2b5b",
+ "Requests Failed": "#bf1b00"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 7,
+ "x": 13,
+ "y": 1
+ },
+ "id": 14,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Requests Failed {{ceph_daemon}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "C"
+ },
+ {
+ "expr": "rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "D"
+ },
+ {
+ "expr": "rate(ceph_rgw_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Other {{ceph_daemon}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "HTTP Request Breakdown",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+ "Failures": "#bf1b00",
+ "GETs": "#7eb26d",
+ "Other (HEAD,POST,DELETE)": "#447ebc",
+ "PUTs": "#eab839"
+ },
+ "breakPoint": "50%",
+ "cacheTimeout": null,
+ "combine": {
+ "label": "Others",
+ "threshold": 0
+ },
+ "datasource": "$datasource",
+ "fontSize": "80%",
+ "format": "none",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 20,
+ "y": 1
+ },
+ "id": 23,
+ "interval": null,
+ "legend": {
+ "show": true,
+ "values": true
+ },
+ "legendType": "Under graph",
+ "links": [],
+ "maxDataPoints": 3,
+ "nullPointMode": "connected",
+ "pieType": "pie",
+ "strokeWidth": 1,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Failures {{ceph_daemon}}",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs {{ceph_daemon}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs {{ceph_daemon}}",
+ "refId": "C"
+ },
+ {
+ "expr": "rate(ceph_rgw_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}",
+ "refId": "D"
+ }
+ ],
+ "title": "Workload Breakdown",
+ "type": "grafana-piechart-panel",
+ "valueName": "current"
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "tags": [],
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "rgw_servers",
+ "options": [],
+ "query": "label_values(ceph_rgw_req, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RGW Instance Detail",
+ "uid": "x5ARzZtmk",
+ "version": 2
+}
diff --git a/monitoring/grafana/dashboards/radosgw-overview.json b/monitoring/grafana/dashboards/radosgw-overview.json
new file mode 100644
index 00000000..487d736b
--- /dev/null
+++ b/monitoring/grafana/dashboards/radosgw-overview.json
@@ -0,0 +1,630 @@
+{
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1534386107523,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "panels": [],
+ "title": "RGW Overview - All Gateways",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 1
+ },
+ "id": 29,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GET AVG",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUT AVG",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Average GET/PUT Latencies",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 7,
+ "x": 8,
+ "y": 1
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by(rgw_host) (label_replace(rate(ceph_rgw_req[30s]), \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total Requests/sec by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 6,
+ "x": 15,
+ "y": 1
+ },
+ "id": 31,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(rate(ceph_rgw_get_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "GET Latencies by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": false
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Total bytes transferred in/out of all radosgw instances within the cluster",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 8
+ },
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(ceph_rgw_get_b[30s]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "GETs",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(ceph_rgw_put_b[30s]))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "PUTs",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth Consumed by Type",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Total bytes transferred in/out through get/put operations, by radosgw instance",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 7,
+ "x": 8,
+ "y": 8
+ },
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by(rgw_host) (\n (label_replace(rate(ceph_rgw_get_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")) + \n (label_replace(rate(ceph_rgw_put_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\"))\n)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bandwidth by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 15,
+ "y": 8
+ },
+ "id": 32,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{rgw_host}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PUT Latencies by RGW Instance",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": false
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ }
+ ],
+ "refresh": "15s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "rgw_servers",
+ "options": [],
+ "query": "label_values(ceph_rgw_req, ceph_daemon)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "current": {
+ "tags": [],
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RGW Overview",
+ "uid": "WAkugZpiz",
+ "version": 2
+}
diff --git a/monitoring/grafana/dashboards/rbd-details.json b/monitoring/grafana/dashboards/rbd-details.json
new file mode 100644
index 00000000..68fffad1
--- /dev/null
+++ b/monitoring/grafana/dashboards/rbd-details.json
@@ -0,0 +1,409 @@
+{
+ "__inputs": [],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.3.3"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Detailed Performance of RBD Images (IOPS/Throughput/Latency)",
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1584428820779,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$Datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_rbd_write_ops{pool=\"$Pool\", image=\"$Image\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_rbd_read_ops{pool=\"$Pool\", image=\"$Image\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "iops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "iops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": true,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$Datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_rbd_write_bytes{pool=\"$Pool\", image=\"$Image\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_rbd_read_bytes{pool=\"$Pool\", image=\"$Image\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Throughput",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": true,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$Datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(ceph_rbd_write_latency_sum{pool=\"$Pool\", image=\"$Image\"}[30s]) / irate(ceph_rbd_write_latency_count{pool=\"$Pool\", image=\"$Image\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write",
+ "refId": "A"
+ },
+ {
+ "expr": "irate(ceph_rbd_read_latency_sum{pool=\"$Pool\", image=\"$Image\"}[30s]) / irate(ceph_rbd_read_latency_count{pool=\"$Pool\", image=\"$Image\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Average Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ns",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "ns",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": true,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": false,
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {},
+ "hide": 0,
+ "label": null,
+ "name": "Datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$Datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "Pool",
+ "options": [],
+ "query": "label_values(pool)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "$Datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "Image",
+ "options": [],
+ "query": "label_values(image)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RBD Details",
+ "uid": "YhCYGcuZz",
+ "version": 7
+} \ No newline at end of file
diff --git a/monitoring/grafana/dashboards/rbd-overview.json b/monitoring/grafana/dashboards/rbd-overview.json
new file mode 100644
index 00000000..eb15fbcb
--- /dev/null
+++ b/monitoring/grafana/dashboards/rbd-overview.json
@@ -0,0 +1,685 @@
+{
+ "__inputs": [],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.4.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1547242766440,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(irate(ceph_rbd_write_ops[30s])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Writes",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(irate(ceph_rbd_read_ops[30s])))",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Reads",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "IOPS",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(irate(ceph_rbd_write_bytes[30s])))",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "Write",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(irate(ceph_rbd_read_bytes[30s])))",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Throughput",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "round(sum(irate(ceph_rbd_write_latency_sum[30s])) / sum(irate(ceph_rbd_write_latency_count[30s])))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "Write",
+ "refId": "A"
+ },
+ {
+ "expr": "round(sum(irate(ceph_rbd_read_latency_sum[30s])) / sum(irate(ceph_rbd_read_latency_count[30s])))",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Read",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Average Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ns",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [],
+ "datasource": "$datasource",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 7
+ },
+ "hideTimeOverride": false,
+ "id": 12,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 3,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Pool",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "pool",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": []
+ },
+ {
+ "alias": "Image",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "image",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "IOPS",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "iops"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10, (sort((irate(ceph_rbd_write_ops[30s]) + on (image, pool, namespace) irate(ceph_rbd_read_ops[30s])))))",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Highest IOPS",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [],
+ "datasource": "$datasource",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 7
+ },
+ "id": 10,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 3,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Pool",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "pool",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Image",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "image",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Throughput",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "Bps"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10, sort(sum(irate(ceph_rbd_read_bytes[30s]) + irate(ceph_rbd_write_bytes[30s])) by (pool, image, namespace)))",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "title": "Highest Throughput",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "columns": [],
+ "datasource": "$datasource",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 7
+ },
+ "id": 14,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 3,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Pool",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "pool",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Image",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "image",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "Latency",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [],
+ "type": "number",
+ "unit": "ns"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n sum(\n irate(ceph_rbd_write_latency_sum[30s]) / clamp_min(irate(ceph_rbd_write_latency_count[30s]), 1) +\n irate(ceph_rbd_read_latency_sum[30s]) / clamp_min(irate(ceph_rbd_read_latency_count[30s]), 1)\n ) by (pool, image, namespace)\n)",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "title": "Highest Latency",
+ "transform": "table",
+ "type": "table"
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "overview"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "label": "Data Source",
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "15s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "RBD Overview",
+ "uid": "41FrpeUiz",
+ "version": 8
+}
diff --git a/monitoring/grafana/screenshots/ceph-cluster1.png b/monitoring/grafana/screenshots/ceph-cluster1.png
new file mode 100644
index 00000000..4c59bd27
--- /dev/null
+++ b/monitoring/grafana/screenshots/ceph-cluster1.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/ceph-cluster2.png b/monitoring/grafana/screenshots/ceph-cluster2.png
new file mode 100644
index 00000000..18a64d2d
--- /dev/null
+++ b/monitoring/grafana/screenshots/ceph-cluster2.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/host-details.png b/monitoring/grafana/screenshots/host-details.png
new file mode 100644
index 00000000..bf963a9c
--- /dev/null
+++ b/monitoring/grafana/screenshots/host-details.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/host-details.resized.png b/monitoring/grafana/screenshots/host-details.resized.png
new file mode 100644
index 00000000..f18ea23a
--- /dev/null
+++ b/monitoring/grafana/screenshots/host-details.resized.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/host_overview.png b/monitoring/grafana/screenshots/host_overview.png
new file mode 100644
index 00000000..e47afa47
--- /dev/null
+++ b/monitoring/grafana/screenshots/host_overview.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/mds-performance.png b/monitoring/grafana/screenshots/mds-performance.png
new file mode 100644
index 00000000..317cd8b1
--- /dev/null
+++ b/monitoring/grafana/screenshots/mds-performance.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/osd-performance.png b/monitoring/grafana/screenshots/osd-performance.png
new file mode 100644
index 00000000..9bb7af44
--- /dev/null
+++ b/monitoring/grafana/screenshots/osd-performance.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/osd_overview.png b/monitoring/grafana/screenshots/osd_overview.png
new file mode 100644
index 00000000..9e0c1fe6
--- /dev/null
+++ b/monitoring/grafana/screenshots/osd_overview.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/pool-details.png b/monitoring/grafana/screenshots/pool-details.png
new file mode 100644
index 00000000..1f300cda
--- /dev/null
+++ b/monitoring/grafana/screenshots/pool-details.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/pool-overview.png b/monitoring/grafana/screenshots/pool-overview.png
new file mode 100644
index 00000000..ffdd28ea
--- /dev/null
+++ b/monitoring/grafana/screenshots/pool-overview.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/rgw-detail.png b/monitoring/grafana/screenshots/rgw-detail.png
new file mode 100644
index 00000000..64356ad3
--- /dev/null
+++ b/monitoring/grafana/screenshots/rgw-detail.png
Binary files differ
diff --git a/monitoring/grafana/screenshots/rgw-overview.png b/monitoring/grafana/screenshots/rgw-overview.png
new file mode 100644
index 00000000..d6efaf79
--- /dev/null
+++ b/monitoring/grafana/screenshots/rgw-overview.png
Binary files differ
diff --git a/monitoring/prometheus/README.md b/monitoring/prometheus/README.md
new file mode 100644
index 00000000..fde63a35
--- /dev/null
+++ b/monitoring/prometheus/README.md
@@ -0,0 +1,7 @@
+## Prometheus related bits
+
+### Alerts
+In monitoring/prometheus/alerts you'll find a set of Prometheus alert rules that
+should provide a decent set of default alerts for a Ceph cluster. Just put this
+file in a place according to your Prometheus configuration (wherever the `rules`
+configuration stanza points).
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
new file mode 100644
index 00000000..ddd19c67
--- /dev/null
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -0,0 +1,268 @@
+groups:
+ - name: cluster health
+ rules:
+ - alert: health error
+ expr: ceph_health_status == 2
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ Ceph in HEALTH_ERROR state for more than 5 minutes.
+ Please check "ceph health detail" for more information.
+
+ - alert: health warn
+ expr: ceph_health_status == 1
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Ceph has been in HEALTH_WARN for more than 15 minutes.
+ Please check "ceph health detail" for more information.
+
+ - name: mon
+ rules:
+ - alert: low monitor quorum count
+ expr: sum(ceph_mon_quorum_status) < 3
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: |
+ Monitor count in quorum is below three.
+
+ Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
+
+ The following monitors are down:
+ {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+
+ - name: osd
+ rules:
+ - alert: 10% OSDs down
+ expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: |
+ {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).
+
+ The following OSDs are down:
+ {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+
+ - alert: OSD down
+ expr: count(ceph_osd_up == 0) > 0
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: |
+ {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
+ {{ $value }} OSD{{ $s }} down for more than 15 minutes.
+
+ {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.
+
+ The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
+ {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+
+ - alert: OSDs near full
+ expr: |
+ (
+ ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
+ * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
+ ) * 100 > 90
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
+ dangerously full: {{ $value | humanize }}%
+
+ - alert: flapping OSD
+ expr: |
+ (
+ rate(ceph_osd_up[5m])
+ * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
+ ) * 60 > 1
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
+ marked down and back up at {{ $value | humanize }} times once a
+ minute for 5 minutes.
+
+ # alert on high deviation from average PG count
+ - alert: high pg count deviation
+ expr: |
+ abs(
+ (
+ (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+ ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+ ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
+ by more than 30% from average PG count.
+ # alert on high commit latency...but how high is too high
+ - name: mds
+ rules:
+ # no mds metrics are exported yet
+ - name: mgr
+ rules:
+ # no mgr metrics are exported yet
+ - name: pgs
+ rules:
+ - alert: pgs inactive
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
+ Inactive placement groups aren't able to serve read/write
+ requests.
+ - alert: pgs unclean
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
+ Unclean PGs haven't been able to completely recover from a
+ previous failure.
+ - name: nodes
+ rules:
+ - alert: root volume full
+ expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
+
+ # alert on nic packet errors and drops rates > 1% packets/s
+ - alert: network packets dropped
+ expr: |
+ (
+ increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) / (
+ increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) >= 10
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Node {{ $labels.instance }} experiences packet drop > 0.01% or >
+ 10 packets/s on interface {{ $labels.device }}.
+
+ - alert: network packet errors
+ expr: |
+ (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) / (
+ increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) >= 10
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Node {{ $labels.instance }} experiences packet errors > 0.01% or
+ > 10 packets/s on interface {{ $labels.device }}.
+
+ - alert: storage filling up
+ expr: |
+ predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
+ on(instance) group_left(nodename) node_uname_info < 0
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
+ will be full in less than 5 days assuming the average fill-up
+ rate of the past 48 hours.
+
+ - alert: MTU Mismatch
+ expr: node_network_mtu_bytes{device!="lo"} != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.5
+ annotations:
+ description: >
+ Node {{ $labels.instance }} has a different MTU size ({{ $value }})
+ than the median value on device {{ $labels.device }}.
+
+ - name: pools
+ rules:
+ - alert: pool full
+ expr: |
+ ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
+ * on(pool_id) group_right ceph_pool_metadata * 100 > 90
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.
+
+ - alert: pool filling up
+ expr: |
+ (
+ predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
+ >= ceph_pool_stored + ceph_pool_max_avail
+ ) * on(pool_id) group_left(name) ceph_pool_metadata
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Pool {{ $labels.name }} will be full in less than 5 days
+ assuming the average fill-up rate of the past 48 hours.
+
+ - name: healthchecks
+ rules:
+ - alert: Slow OSD Ops
+ expr: ceph_healthcheck_slow_ops > 0
+ for: 30s
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)