diff options
Diffstat (limited to 'monitoring/ceph-mixin/dashboards/rgw.libsonnet')
-rw-r--r-- | monitoring/ceph-mixin/dashboards/rgw.libsonnet | 872 |
1 file changed, 872 insertions, 0 deletions
// RGW (RADOS Gateway) Grafana dashboards for the Ceph mixin, built with
// grafonnet.  The exported object maps an output filename to a dashboard:
//   - radosgw-sync-overview.json : multisite replication traffic per source zone
//   - radosgw-overview.json      : all-gateway overview plus HAProxy ingress metrics
//   - radosgw-detail.json        : per-instance ($rgw_servers) drill-down
// Helper constructors ($.dashboardSchema, $.graphPanelSchema, $.addTargetSchema,
// $.matchers, ...) come from utils.libsonnet, which is mixed into the object below.
local g = import 'grafonnet/grafana.libsonnet';
// NOTE(review): 'u' is never referenced in this file (utils.libsonnet is also
// mixed in below via the object-level import) — confirm before removing.
local u = import 'utils.libsonnet';

(import 'utils.libsonnet') {
  'radosgw-sync-overview.json':
    // Graph-panel preset for the sync dashboard: one series per source_zone,
    // plotting the per-second rate of the given RGW sync perf counter.
    // formatY1/labelY1 configure the left Y axis; x/y/w/h place the panel on
    // the dashboard grid.
    local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
      $.graphPanelSchema({},
                         title,
                         '',
                         'null as zero',
                         true,
                         formatY1,
                         'short',
                         labelY1,
                         null,
                         0,
                         1,
                         '$datasource')
      .addTargets(
        [
          $.addTargetSchema(
            // %(matchers)s expands to the cluster/job label matchers supplied
            // by $.matchers(); rgwMetric is spliced in via the merged map.
            'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))'
            % ($.matchers() + { rgwMetric: rgwMetric }),
            '{{source_zone}}'
          ),
        ]
      ) + { gridPos: { x: x, y: y, w: w, h: h } };

    $.dashboardSchema(
      'RGW Sync Overview',
      '',
      'rgw-sync-overview',  // dashboard UID
      'now-1h',
      '30s',
      16,  // schemaVersion
      $._config.dashboardTags + ['overview'],
      ''
    )
    .addAnnotation(
      $.addAnnotationSchema(
        1,
        '-- Grafana --',
        true,
        true,
        'rgba(0, 211, 255, 1)',
        'Annotations & Alerts',
        'dashboard'
      )
    )
    .addRequired(
      type='grafana', id='grafana', name='Grafana', version='5.0.0'
    )
    .addRequired(
      type='panel', id='graph', name='Graph', version='5.0.0'
    )
    .addTemplate(
      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
    )
    .addTemplate(
      $.addClusterTemplate()
    )
    .addTemplate(
      $.addJobTemplate()
    )
    .addTemplate(
      $.addTemplateSchema(
        'rgw_servers',
        '$datasource',
        'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
        1,
        true,
        1,
        '',
        'RGW Server'
      )
    )
    .addPanels([
      // Row 1: throughput / object rate / poll latency; row 2: fetch errors.
      RgwSyncOverviewPanel(
        'Replication (throughput) from Source Zone',
        'Bps',
        null,
        'ceph_data_sync_from_zone_fetch_bytes_sum',
        0,
        0,
        8,
        7
      ),
      RgwSyncOverviewPanel(
        'Replication (objects) from Source Zone',
        'short',
        'Objects/s',
        // The _count of the fetch_bytes counter pair is plotted as Objects/s —
        // presumably one increment per fetched object; TODO confirm.
        'ceph_data_sync_from_zone_fetch_bytes_count',
        8,
        0,
        8,
        7
      ),
      RgwSyncOverviewPanel(
        'Polling Request Latency from Source Zone',
        'ms',
        null,
        // NOTE(review): this rates a latency _sum and labels the axis 'ms';
        // an average (sum/count) may be intended — confirm.
        'ceph_data_sync_from_zone_poll_latency_sum',
        16,
        0,
        8,
        7
      ),
      RgwSyncOverviewPanel(
        'Unsuccessful Object Replications from Source Zone',
        'short',
        'Count/s',
        'ceph_data_sync_from_zone_fetch_errors',
        0,
        7,
        8,
        7
      ),
    ]),
  'radosgw-overview.json':
    // Generic single-query graph panel for the overview dashboard.  Extra
    // targets/series overrides are chained on by callers via .addTargets /
    // .addSeriesOverride.  The legend_* flags toggle the tabular legend and
    // its avg/min/max/current columns.
    local RgwOverviewPanel(
      title,
      description,
      formatY1,
      formatY2,
      expr1,
      legendFormat1,
      x,
      y,
      w,
      h,
      datasource='$datasource',
      legend_alignAsTable=false,
      legend_avg=false,
      legend_min=false,
      legend_max=false,
      legend_current=false,
      legend_values=false
    ) =
      $.graphPanelSchema(
        {},
        title,
        description,
        'null',
        false,
        formatY1,
        formatY2,
        null,
        null,
        0,
        1,
        datasource,
        legend_alignAsTable,
        legend_avg,
        legend_min,
        legend_max,
        legend_current,
        legend_values
      )
      .addTargets(
        [$.addTargetSchema(expr1, legendFormat1)]
      ) + { gridPos: { x: x, y: y, w: w, h: h } };

    $.dashboardSchema(
      'RGW Overview',
      '',
      'WAkugZpiz',  // dashboard UID
      'now-1h',
      '30s',
      16,
      $._config.dashboardTags + ['overview'],
      ''
    )
    .addAnnotation(
      $.addAnnotationSchema(
        1,
        '-- Grafana --',
        true,
        true,
        'rgba(0, 211, 255, 1)',
        'Annotations & Alerts',
        'dashboard'
      )
    )
    .addRequired(
      type='grafana', id='grafana', name='Grafana', version='5.0.0'
    )
    .addRequired(
      type='panel', id='graph', name='Graph', version='5.0.0'
    )
    .addTemplate(
      g.template.datasource('datasource',
                            'prometheus',
                            'default',
                            label='Data Source')
    )
    .addTemplate(
      $.addClusterTemplate()
    )
    .addTemplate(
      $.addJobTemplate()
    )
    .addTemplate(
      // NOTE(review): here the human-readable text sits in the 8th positional
      // slot ('RGW Server'), while the 'code'/'ingress_service' templates below
      // put it in the 7th — verify against addTemplateSchema's signature in
      // utils.libsonnet that label/regex are not swapped somewhere.
      $.addTemplateSchema(
        'rgw_servers',
        '$datasource',
        'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
        1,
        true,
        1,
        '',
        'RGW Server'
      )
    )
    .addTemplate(
      // HTTP status codes observed by HAProxy, filterable per ingress service.
      $.addTemplateSchema(
        'code',
        '$datasource',
        'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)',
        1,
        true,
        1,
        'HTTP Code',
        ''
      )
    )
    .addTemplate(
      $.addTemplateSchema(
        'job_haproxy',
        '$datasource',
        'label_values(haproxy_server_status, job)',
        1,
        true,
        1,
        'job haproxy',
        '(.*)',
        multi=true,
        allValues='.+',
      ),
    )
    .addTemplate(
      $.addTemplateSchema(
        'ingress_service',
        '$datasource',
        'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)',
        1,
        true,
        1,
        'Ingress Service',
        ''
      )
    )
    .addPanels([
      $.addRowSchema(false,
                     true,
                     'RGW Overview - All Gateways') +
      {
        gridPos: { x: 0, y: 0, w: 24, h: 1 },
      },
      // Average request latency (sum/count) per gateway.  label_replace maps
      // the ceph_daemon label ("rgw.<host>") onto a friendlier rgw_host label;
      // the join on instance_id pulls ceph_daemon in from ceph_rgw_metadata.
      RgwOverviewPanel(
        'Average GET/PUT Latencies by RGW Instance',
        '',
        's',
        'short',
        |||
          label_replace(
            rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
              rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
              on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
            "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
          )
        ||| % $.matchers(),
        'GET {{rgw_host}}',
        0,
        1,
        8,
        7
      ).addTargets(
        [
          $.addTargetSchema(
            |||
              label_replace(
                rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
                  rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
                  on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
                "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
              )
            ||| % $.matchers(),
            'PUT {{rgw_host}}'
          ),
        ]
      ),
      RgwOverviewPanel(
        'Total Requests/sec by RGW Instance',
        '',
        'none',
        'short',
        |||
          sum by (rgw_host) (
            label_replace(
              rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
                on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
              "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
            )
          )
        ||| % $.matchers(),
        '{{rgw_host}}',
        8,
        1,
        7,
        7
      ),
      RgwOverviewPanel(
        'GET Latencies by RGW Instance',
        'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
        's',
        'short',
        |||
          label_replace(
            rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
              rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
              on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
            "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
          )
        ||| % $.matchers(),
        '{{rgw_host}}',
        15,
        1,
        6,
        7
      ),
      RgwOverviewPanel(
        'Bandwidth Consumed by Type',
        'Total bytes transferred in/out of all radosgw instances within the cluster',
        'bytes',
        'short',
        'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
        'GETs',
        0,
        8,
        8,
        6
      ).addTargets(
        [$.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
                           'PUTs')]
      ),
      RgwOverviewPanel(
        'Bandwidth by RGW Instance',
        'Total bytes transferred in/out through get/put operations, by radosgw instance',
        'bytes',
        'short',
        |||
          label_replace(sum by (instance_id) (
            rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) +
              rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) *
            on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
            "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
          )
        ||| % $.matchers(),
        '{{rgw_host}}',
        8,
        8,
        7,
        6
      ),
      RgwOverviewPanel(
        'PUT Latencies by RGW Instance',
        'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
        's',
        'short',
        |||
          label_replace(
            rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
              rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
              on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
            "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
          )
        ||| % $.matchers(),
        '{{rgw_host}}',
        15,
        8,
        6,
        6
      ),
      // HAProxy (ingress) section.  Frontend series plot positive, backend
      // series are flipped negative-Y via series overrides for a mirror view.
      $.addRowSchema(
        false, true, 'RGW Overview - HAProxy Metrics'
      ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
      RgwOverviewPanel(
        'Total responses by HTTP code',
        '',
        'short',
        'short',
        |||
          sum(
            rate(
              haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval]
            )
          ) by (code)
        |||,
        'Frontend {{ code }}',
        0,
        12,
        5,
        12,
        '$datasource',
        true,
        true,
        true,
        true,
        true,
        true
      )
      .addTargets(
        [
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval]
                )
              ) by (code)
            |||, 'Backend {{ code }}'
          ),
        ]
      )
      .addSeriesOverride([
        {
          alias: '/.*Back.*/',
          transform: 'negative-Y',
        },
        { alias: '/.*1.*/' },
        { alias: '/.*2.*/' },
        { alias: '/.*3.*/' },
        { alias: '/.*4.*/' },
        { alias: '/.*5.*/' },
        { alias: '/.*other.*/' },
      ]),
      RgwOverviewPanel(
        'Total requests / responses',
        '',
        'short',
        'short',
        |||
          sum(
            rate(
              haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
            )
          ) by (instance)
        |||,
        'Requests',
        5,
        12,
        5,
        12,
        '$datasource',
        true,
        true,
        true,
        true,
        true,
        true
      )
      .addTargets(
        [
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                )
              ) by (instance)
            |||, 'Response errors', 'time_series', 2
          ),
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                )
              ) by (instance)
            |||, 'Requests errors'
          ),
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                )
              ) by (instance)
            |||, 'Backend redispatch', 'time_series', 2
          ),
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                )
              ) by (instance)
            |||, 'Backend retry', 'time_series', 2
          ),
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                )
              ) by (instance)
            |||, 'Request denied', 'time_series', 2
          ),
          $.addTargetSchema(
            |||
              sum(
                haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}
              ) by (instance)
            |||, 'Backend Queued', 'time_series', 2
          ),
        ]
      )
      .addSeriesOverride([
        {
          alias: '/.*Response.*/',
          transform: 'negative-Y',
        },
        {
          alias: '/.*Backend.*/',
          transform: 'negative-Y',
        },
      ]),
      RgwOverviewPanel(
        'Total number of connections',
        '',
        'short',
        'short',
        |||
          sum(
            rate(
              haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
            )
          ) by (instance)
        |||,
        'Front',
        10,
        12,
        5,
        12,
        '$datasource',
        true,
        true,
        true,
        true,
        true,
        true
      )
      .addTargets(
        [
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                )
              ) by (instance)
            |||, 'Back'
          ),
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                )
              ) by (instance)
            |||, 'Back errors'
          ),
        ]
      )
      .addSeriesOverride([
        {
          alias: '/.*Back.*/',
          transform: 'negative-Y',
        },
      ]),
      // Byte counters are multiplied by 8: values are plotted as bits/s even
      // though the Y-axis format is left as 'short'.
      RgwOverviewPanel(
        'Current total of incoming / outgoing bytes',
        '',
        'short',
        'short',
        |||
          sum(
            rate(
              haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
            ) * 8
          ) by (instance)
        |||,
        'IN Front',
        15,
        12,
        6,
        12,
        '$datasource',
        true,
        true,
        true,
        true,
        true,
        true
      )
      .addTargets(
        [
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                ) * 8
              ) by (instance)
            |||, 'OUT Front', 'time_series', 2
          ),
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                ) * 8
              ) by (instance)
            |||, 'IN Back', 'time_series', 2
          ),
          $.addTargetSchema(
            |||
              sum(
                rate(
                  haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
                ) * 8
              ) by (instance)
            |||, 'OUT Back', 'time_series', 2
          ),
        ]
      )
      .addSeriesOverride([
        {
          alias: '/.*OUT.*/',
          transform: 'negative-Y',
        },
      ]),
    ]),
  'radosgw-detail.json':
    // Two-target graph panel for the per-instance detail dashboard.
    // aliasColors pins series colors by legend name.
    local RgwDetailsPanel(aliasColors,
                          title,
                          description,
                          formatY1,
                          formatY2,
                          expr1,
                          expr2,
                          legendFormat1,
                          legendFormat2,
                          x,
                          y,
                          w,
                          h) =
      $.graphPanelSchema(aliasColors,
                         title,
                         description,
                         'null',
                         false,
                         formatY1,
                         formatY2,
                         null,
                         null,
                         0,
                         1,
                         '$datasource')
      .addTargets(
        [$.addTargetSchema(expr1, legendFormat1), $.addTargetSchema(expr2, legendFormat2)]
      ) + { gridPos: { x: x, y: y, w: w, h: h } };

    $.dashboardSchema(
      'RGW Instance Detail',
      '',
      'x5ARzZtmk',  // dashboard UID
      'now-1h',
      '30s',
      16,
      $._config.dashboardTags + ['overview'],
      ''
    )
    .addAnnotation(
      $.addAnnotationSchema(
        1,
        '-- Grafana --',
        true,
        true,
        'rgba(0, 211, 255, 1)',
        'Annotations & Alerts',
        'dashboard'
      )
    )
    .addRequired(
      type='grafana', id='grafana', name='Grafana', version='5.0.0'
    )
    .addRequired(
      // Needed for the 'Workload Breakdown' pie chart below.
      type='panel',
      id='grafana-piechart-panel',
      name='Pie Chart',
      version='1.3.3'
    )
    .addRequired(
      type='panel', id='graph', name='Graph', version='5.0.0'
    )
    .addTemplate(
      g.template.datasource('datasource',
                            'prometheus',
                            'default',
                            label='Data Source')
    )
    .addTemplate(
      $.addClusterTemplate()
    )
    .addTemplate(
      $.addJobTemplate()
    )
    .addTemplate(
      $.addTemplateSchema('rgw_servers',
                          '$datasource',
                          'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
                          1,
                          true,
                          1,
                          '',
                          '')
    )
    .addPanels([
      $.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
      // All queries below join on instance_id against ceph_rgw_metadata
      // filtered by the $rgw_servers template to scope to selected daemons.
      RgwDetailsPanel(
        {},
        '$rgw_servers GET/PUT Latencies',
        '',
        's',
        'short',
        |||
          sum by (instance_id) (
            rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
              rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval])
          ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        |||
          sum by (instance_id) (
            rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
              rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval])
          ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        'GET {{ceph_daemon}}',
        'PUT {{ceph_daemon}}',
        0,
        1,
        6,
        8
      ),
      RgwDetailsPanel(
        {},
        'Bandwidth by HTTP Operation',
        '',
        'bytes',
        'short',
        |||
          rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) *
            on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        |||
          rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) *
            on (instance_id) group_left (ceph_daemon)
            ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        'GETs {{ceph_daemon}}',
        'PUTs {{ceph_daemon}}',
        6,
        1,
        7,
        8
      ),
      // Request mix: failures, GETs, PUTs, plus "Other" derived as
      // total requests minus (GETs + PUTs).
      RgwDetailsPanel(
        {
          GETs: '#7eb26d',
          Other: '#447ebc',
          PUTs: '#eab839',
          Requests: '#3f2b5b',
          'Requests Failed': '#bf1b00',
        },
        'HTTP Request Breakdown',
        '',
        'short',
        'short',
        |||
          rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
            on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        |||
          rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
            on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        'Requests Failed {{ceph_daemon}}',
        'GETs {{ceph_daemon}}',
        13,
        1,
        7,
        8
      )
      .addTargets(
        [
          $.addTargetSchema(
            |||
              rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
                on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
            ||| % $.matchers(),
            'PUTs {{ceph_daemon}}'
          ),
          $.addTargetSchema(
            |||
              (
                rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
                  (
                    rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
                      rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
                  )
              ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
            ||| % $.matchers(),
            'Other {{ceph_daemon}}'
          ),
        ]
      ),
      // Pie-chart view of the same workload mix (requires the piechart plugin).
      $.simplePieChart(
        {
          GETs: '#7eb26d',
          'Other (HEAD,POST,DELETE)': '#447ebc',
          PUTs: '#eab839',
          Requests: '#3f2b5b',
          Failures: '#bf1b00',
        }, '', 'Workload Breakdown'
      )
      .addTarget($.addTargetSchema(
        |||
          rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
            on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        'Failures {{ceph_daemon}}'
      ))
      .addTarget($.addTargetSchema(
        |||
          rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
            on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        'GETs {{ceph_daemon}}'
      ))
      .addTarget($.addTargetSchema(
        |||
          rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
            on (instance_id) group_left (ceph_daemon)
            ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        'PUTs {{ceph_daemon}}'
      ))
      .addTarget($.addTargetSchema(
        |||
          (
            rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
              (
                rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
                  rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
              )
          ) * on (instance_id) group_left (ceph_daemon)
            ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
        ||| % $.matchers(),
        'Other (DELETE,LIST) {{ceph_daemon}}'
      )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
    ]),
}