From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- .../tests_dashboards/features/__init__.py | 0 .../tests_dashboards/features/ceph-cluster.feature | 54 +++++ .../tests_dashboards/features/environment.py | 135 +++++++++++ .../tests_dashboards/features/host-details.feature | 131 +++++++++++ .../features/hosts_overview.feature | 41 ++++ .../features/osd-device-details.feature | 88 ++++++++ .../features/osds-overview.feature | 15 ++ .../features/radosgw-detail.feature | 139 ++++++++++++ .../features/radosgw_overview.feature | 250 +++++++++++++++++++++ .../tests_dashboards/features/self.feature | 68 ++++++ .../tests_dashboards/features/steps/__init__.py | 1 + 11 files changed, 922 insertions(+) create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/__init__.py create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/environment.py create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/host-details.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/self.feature create mode 100644 monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py (limited to 'monitoring/ceph-mixin/tests_dashboards/features') diff --git a/monitoring/ceph-mixin/tests_dashboards/features/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature b/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature new file mode 100644 index 000000000..1a446cd2c --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature @@ -0,0 +1,54 @@ +Feature: Ceph Cluster Dashboard + +Scenario: "Test total PG States" + Given the following series: + | metrics | values | + | ceph_pg_total{foo="var"} | 10 100 | + | ceph_pg_total{foo="bar"} | 20 200 | + Then Grafana panel `PG States` with legend `Total` shows: + | metrics | values | + | {} | 300 | + +Scenario: "Test OSDs in" + Given the following series: + | metrics | values | + | ceph_osd_in{ceph_daemon="osd.0"} | 1.0 | + | ceph_osd_in{ceph_daemon="osd.1"} | 0.0 | + | ceph_osd_in{ceph_daemon="osd.2"} | 1.0 | + When variable `instance` is `.*` + Then Grafana panel `OSDs` with legend `In` shows: + | metrics | values | + | {} | 2 | + +Scenario: "Test OSDs down" + Given the following series: + | metrics | values | + | ceph_osd_up{ceph_daemon="osd.0", instance="127.0.0.1"} | 0.0 | + | ceph_osd_up{ceph_daemon="osd.1", instance="127.0.0.1"} | 0.0 | + | ceph_osd_up{ceph_daemon="osd.2", instance="127.0.0.1"} | 0.0 | + When variable `instance` is `127.0.0.1` + Then Grafana panel `OSDs` with legend `Down` shows: + | metrics | values | + | {} | 3 | + +Scenario: "Test OSDs out" + Given the following series: + | metrics | values | + | ceph_osd_in{ceph_daemon="osd.0", 
instance="127.0.0.1"} | 0.0 | + | ceph_osd_in{ceph_daemon="osd.1", instance="127.0.0.1"} | 1.0 | + | ceph_osd_in{ceph_daemon="osd.2", instance="127.0.0.1"} | 0.0 | + When variable `instance` is `127.0.0.1` + Then Grafana panel `OSDs` with legend `Out` shows: + | metrics | values | + | {} | 2 | + +Scenario: "Test OSDs all" + Given the following series: + | metrics | values | + | ceph_osd_metadata{ceph_daemon="osd.0", instance="127.0.0.1"} | 1.0 | + | ceph_osd_metadata{ceph_daemon="osd.1", instance="127.0.0.1"} | 1.0 | + | ceph_osd_metadata{ceph_daemon="osd.2", instance="127.0.0.1"} | 1.0 | + When variable `instance` is `127.0.0.1` + Then Grafana panel `OSDs` with legend `All` shows: + | metrics | values | + | {} | 3 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/environment.py b/monitoring/ceph-mixin/tests_dashboards/features/environment.py new file mode 100644 index 000000000..5dc76a09e --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/environment.py @@ -0,0 +1,135 @@ +# type: ignore[no-redef] +# pylint: disable=E0611,W0613,E0102 +import copy + +from behave import given, then, when +from prettytable import PrettyTable + +from tests_dashboards import PromqlTest +from tests_dashboards.util import get_dashboards_data, resolve_time_and_unit + + +class GlobalContext: + def __init__(self): + self.tested_queries_count = 0 + self.promql_expr_test = None + self.data = get_dashboards_data() + self.query_map = self.data['queries'] + + def reset_promql_test(self): + self.promql_expr_test = PromqlTest() + self.promql_expr_test.variables = copy.copy(self.data['variables']) + + def print_query_stats(self): + total = len(self.query_map) + table = PrettyTable() + table.field_names = ['Name', 'Queries', 'Tested', 'Cover'] + + def percent(tested, total): + return str(round((tested / total) * 100, 2)) + '%' + + def file_name(path): + return path.split('/')[-1] + + total = 0 + tested = 0 + for path, stat in self.data['stats'].items(): + assert stat['total'] + table.add_row([file_name(path), stat['total'], stat['tested'], + percent(stat['tested'], stat['total'])]) + total += stat['total'] + tested += stat['tested'] + + assert total + table.add_row(['Total', total, tested, percent(tested, total)]) + print(table) + + +global_context = GlobalContext() + +# Behave function overloading +# =========================== + + +def before_scenario(context, scenario): + global_context.reset_promql_test() + + +def after_scenario(context, scenario): + assert global_context.promql_expr_test.run_promtool() + + +def after_all(context): + global_context.print_query_stats() + + +@given("the following series") +def step_impl(context): + for row in context.table: + metric = row['metrics'] + value = row['values'] + global_context.promql_expr_test.add_series(metric, value) + + +@when('evaluation interval is `{interval}`') +def step_impl(context, interval): + interval_without_unit, unit = resolve_time_and_unit(interval) + if interval_without_unit is None: + raise ValueError(f'Invalid interval time: {interval_without_unit}. ' + + 'A valid time looks like "1m" where you have a number plus a unit') + global_context.promql_expr_test.set_evaluation_interval(interval_without_unit, unit) + + +@when('interval is `{interval}`') +def step_impl(context, interval): + interval_without_unit, unit = resolve_time_and_unit(interval) + if interval_without_unit is None: + raise ValueError(f'Invalid interval time: {interval_without_unit}. 
' + + 'A valid time looks like "1m" where you have a number plus a unit') + global_context.promql_expr_test.set_interval(interval_without_unit, unit) + + +@when('evaluation time is `{eval_time}`') +def step_impl(context, eval_time): + eval_time_without_unit, unit = resolve_time_and_unit(eval_time) + if eval_time_without_unit is None: + raise ValueError(f'Invalid evalution time: {eval_time}. ' + + 'A valid time looks like "1m" where you have a number plus a unit') + global_context.promql_expr_test.set_eval_time(eval_time_without_unit, unit) + + +@when('variable `{variable}` is `{value}`') +def step_impl(context, variable, value): + global_context.promql_expr_test.set_variable(variable, value) + + +@then('Grafana panel `{panel_name}` with legend `{legend}` shows') +def step_impl(context, panel_name, legend): + """ + This step can have an empty legend. As 'behave' doesn't provide a way + to say it's empty we use EMPTY to mark as empty. + """ + if legend == "EMPTY": + legend = '' + query_id = panel_name + '-' + legend + if query_id not in global_context.query_map: + raise KeyError((f'Query with legend {legend} in panel "{panel_name}"' + 'couldn\'t be found')) + + expr = global_context.query_map[query_id]['query'] + global_context.promql_expr_test.set_expression(expr) + for row in context.table: + metric = row['metrics'] + value = row['values'] + global_context.promql_expr_test.add_exp_samples(metric, float(value)) + path = global_context.query_map[query_id]['path'] + global_context.data['stats'][path]['tested'] += 1 + + +@then('query `{query}` produces') +def step_impl(context, query): + global_context.promql_expr_test.set_expression(query) + for row in context.table: + metric = row['metrics'] + value = row['values'] + global_context.promql_expr_test.add_exp_samples(metric, float(value)) diff --git a/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature new file mode 100644 index 000000000..51e3c5819 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature @@ -0,0 +1,131 @@ +Feature: Host Details Dashboard + +Scenario: "Test OSD" + Given the following series: + | metrics | values | + | ceph_osd_metadata{job="ceph",back_iface="",ceph_daemon="osd.0",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 | + | ceph_osd_metadata{job="ceph",back_iface="",ceph_daemon="osd.1",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 | + | ceph_osd_metadata{job="ceph",back_iface="",ceph_daemon="osd.2",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 | + When variable `ceph_hosts` is `127.0.0.1` + Then Grafana panel `OSDs` with legend `EMPTY` shows: + | metrics | values | + | {} | 3 | + +# IOPS Panel - begin + +Scenario: "Test Disk IOPS - Writes - Several OSDs per device" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{job="ceph",device="sda",instance="localhost:9100"} 
| 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0 osd.1 osd.2",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.3 osd.4 osd.5",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) writes` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0 osd.1 osd.2", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.3 osd.4 osd.5", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test Disk IOPS - Writes - Single OSD per device" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) writes` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test Disk IOPS - Reads - Several OSDs per device" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0 osd.1 osd.2",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.3 osd.4 osd.5",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) reads` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0 osd.1 osd.2", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.3 osd.4 osd.5", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test Disk IOPS - Reads - Single OSD per device" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) reads` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 | + +# IOPS Panel - end + +# Node disk bytes written/read panel - begin + +Scenario: "Test disk throughput - read" + Given the following series: + | metrics | values | + | 
node_disk_read_bytes_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_read_bytes_total{job="ceph",device="sdb",instance="localhost:9100"} | 100+600x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Throughput by Disk` with legend `{{device}}({{ceph_daemon}}) read` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 10 | + +Scenario: "Test disk throughput - write" + Given the following series: + | metrics | values | + | node_disk_written_bytes_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_written_bytes_total{job="ceph",device="sdb",instance="localhost:9100"} | 100+600x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Throughput by Disk` with legend `{{device}}({{ceph_daemon}}) write` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 10 | + +# Node disk bytes written/read panel - end + +Scenario: "Test $ceph_hosts Disk Latency panel" + Given the following series: + | metrics | values | + | node_disk_write_time_seconds_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_write_time_seconds_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_read_time_seconds_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_read_time_seconds_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk Latency` with legend `{{device}}({{ceph_daemon}})` shows: + | metrics | values | + | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test $ceph_hosts Disk utilization" + Given the following series: + | metrics | values | + | node_disk_io_time_seconds_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_io_time_seconds_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | 
ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk utilization` with legend `{{device}}({{ceph_daemon}})` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 100 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 100 | + diff --git a/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature new file mode 100644 index 000000000..6c5eceaed --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature @@ -0,0 +1,41 @@ +Feature: Hosts Overview Dashboard + +Scenario: "Test network load succeeds" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + When variable `osd_hosts` is `127.0.0.1` + Then Grafana panel `Network Load` with legend `EMPTY` shows: + | metrics | values | + | {} | 6 | + +Scenario: "Test network load with bonding succeeds" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="bond0"} | 20 200 300 | + | node_network_transmit_bytes{instance="127.0.0.1", device="bond0"} | 20 200 300 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When variable `osd_hosts` is `127.0.0.1` + Then Grafana panel `Network Load` with legend `EMPTY` shows: + | metrics | values | + | {} | 6 | + +Scenario: "Test AVG Disk Utilization" + Given the following series: + | metrics | values | + | node_disk_io_time_seconds_total{device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_io_time_seconds_total{device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_io_time_seconds_total{device="sdc",instance="localhost:9100"} | 10 2000 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd_hosts` is `localhost` + Then Grafana panel `AVG Disk Utilization` with legend `EMPTY` shows: + | metrics | values | + | {} | 100 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature new file mode 100644 index 000000000..0d6ca8b17 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature @@ -0,0 +1,88 @@ +Feature: OSD device details + +Scenario: "Test Physical Device Latency for $osd - Reads" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 60 | + | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 60 | + | 
node_disk_read_time_seconds_total{device="sda",instance="localhost"} | 100 600 | + | node_disk_read_time_seconds_total{device="sdb",instance="localhost"} | 100 600 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device Latency for $osd` with legend `{{instance}}/{{device}} Reads` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 10 | + +Scenario: "Test Physical Device Latency for $osd - Writes" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 60 | + | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 60 | + | node_disk_write_time_seconds_total{device="sda",instance="localhost"} | 100 600 | + | node_disk_write_time_seconds_total{device="sdb",instance="localhost"} | 100 600 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device Latency for $osd` with legend `{{instance}}/{{device}} Writes` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 10 | + +Scenario: "Test Physical Device R/W IOPS for $osd - Writes" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 100 | + | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Writes` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device R/W IOPS for $osd - Reads" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 100 | + | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Reads` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device R/W Bytes for $osd - Reads" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 100 | + | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with 
legend `{{device}} on {{instance}} Reads` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device R/W Bytes for $osd - Writes" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 100 | + | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Writes` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device Util% for $osd" + Given the following series: + | metrics | values | + | node_disk_io_time_seconds_total{device="sda",instance="localhost:9100"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device Util% for $osd` with legend `{{device}} on {{instance}}` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature new file mode 100644 index 000000000..78d306419 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature @@ -0,0 +1,15 @@ +Feature: OSD Overview + +Scenario: "Test OSD onode Hits Ratio" + Given the following series: + | metrics | values | + | ceph_bluestore_onode_hits{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"} | 5255 | + | ceph_bluestore_onode_hits{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"} | 5419 | + | ceph_bluestore_onode_hits{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"} | 5242 | + | ceph_bluestore_onode_misses{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"} | 202 | + | ceph_bluestore_onode_misses{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"} | 247 | + | ceph_bluestore_onode_misses{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"} | 234 | + Then Grafana panel `OSD onode Hits Ratio` with legend `EMPTY` shows: + | metrics | values | + | {} | 9.588529429483704E-01 | + diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature new file mode 100644 index 000000000..e0016c507 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature @@ -0,0 +1,139 @@ +Feature: RGW Host Detail Dashboard + +Scenario: "Test $rgw_servers GET/PUT Latencies - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `$rgw_servers GET/PUT Latencies` with legend `GET {{ceph_daemon}}` shows: 
+ | metrics | values | + | {ceph_daemon="rgw.foo", instance_id="58892247"} | 1.5 | + +Scenario: "Test $rgw_servers GET/PUT Latencies - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `$rgw_servers GET/PUT Latencies` with legend `PUT {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance_id="58892247"} | 1 | + +Scenario: "Test Bandwidth by HTTP Operation - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.1` + Then Grafana panel `Bandwidth by HTTP Operation` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1.5 | + +Scenario: "Test Bandwidth by HTTP Operation - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.1` + Then Grafana panel `Bandwidth by HTTP Operation` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 7.5E-01 | + +Scenario: "Test HTTP Request Breakdown - Requests Failed" + Given the following series: + | metrics | values | + | ceph_rgw_failed_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 5 7 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `Requests Failed {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1E-01 | + +Scenario: "Test HTTP Request Breakdown - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.1666666666666667 | + +Scenario: "Test HTTP Request Breakdown - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 
1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.5 | + +Scenario: "Test HTTP Request Breakdown - Other" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 175 250 345 | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `Other {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .16666666666666652 | + +Scenario: "Test Workload Breakdown - Failures" + Given the following series: + | metrics | values | + | ceph_rgw_failed_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 5 7 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `Failures {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1E-01 | + +Scenario: "Test Workload Breakdown - GETs" + Given the following series: + | metrics | values | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.1666666666666667 | + +Scenario: "Test Workload Breakdown - PUTs" + Given the following series: + | metrics | values | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.5 | + +Scenario: "Test Workload Breakdown - Other" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 175 250 345 | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `Other (DELETE,LIST) {{ceph_daemon}}` shows: + | 
metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .16666666666666652 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature new file mode 100644 index 000000000..642e43978 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature @@ -0,0 +1,250 @@ +Feature: RGW Overview Dashboard + +Scenario: "Test Average GET Latencies" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `GET {{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1.5 | + +Scenario: "Test Average PUT Latencies" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `PUT {{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1 | + +Scenario: "Test Total Requests/sec by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {rgw_host="1"} | 1.5 | + +Scenario: "Test GET Latencies by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1.5 | + +Scenario: "Test Bandwidth Consumed by Type- GET" + Given the following series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `Bandwidth Consumed by Type` with legend `GETs` shows: + | metrics | values | + | {} | 1.5 | + +Scenario: "Test Bandwidth Consumed by Type- PUT" + Given the following series: + | metrics | values | + | 
ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `Bandwidth Consumed by Type` with legend `PUTs` shows: + | metrics | values | + | {} | 7.5E-01 | + +Scenario: "Test Bandwidth by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.25 | + +Scenario: "Test PUT Latencies by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1 | + +Scenario: "Test Total backend responses by HTTP code" + Given the following series: + | metrics | values | + | haproxy_backend_http_responses_total{job="haproxy",code="200",instance="ingress.rgw.1",proxy="backend"} | 10 100 | + | haproxy_backend_http_responses_total{job="haproxy",code="404",instance="ingress.rgw.1",proxy="backend"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + When variable `code` is `200` + Then Grafana panel `Total responses by HTTP code` with legend `Backend {{ code }}` shows: + | metrics | values | + | {code="200"} | 1.5 | + +Scenario: "Test Total frontend responses by HTTP code" + Given the following series: + | metrics | values | + | haproxy_frontend_http_responses_total{job="haproxy",code="200",instance="ingress.rgw.1",proxy="frontend"} | 10 100 | + | haproxy_frontend_http_responses_total{job="haproxy",code="404",instance="ingress.rgw.1",proxy="frontend"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + When variable `code` is `200` + Then Grafana panel `Total responses by HTTP code` with legend `Frontend {{ code }}` shows: + | metrics | values | + | {code="200"} | 1.5 | + +Scenario: "Test Total http frontend requests by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_http_requests_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_http_requests_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Requests` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend response errors by instance" + Given the following series: + | metrics | values | + | haproxy_backend_response_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | 
haproxy_backend_response_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Response errors` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total frontend requests errors by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_request_errors_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_request_errors_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Requests errors` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend redispatch warnings by instance" + Given the following series: + | metrics | values | + | haproxy_backend_redispatch_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_redispatch_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Backend redispatch` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend retry warnings by instance" + Given the following series: + | metrics | values | + | haproxy_backend_retry_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_retry_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Backend retry` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total frontend requests denied by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_requests_denied_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_requests_denied_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Request denied` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend current queue by instance" + Given the following series: + | metrics | values | + | haproxy_backend_current_queue{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_current_queue{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Backend Queued` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 200 | + +Scenario: "Test Total frontend connections by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_connections_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_connections_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total number of connections` with legend `Front` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend connections attempts by instance" 
+ Given the following series: + | metrics | values | + | haproxy_backend_connection_attempts_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_connection_attempts_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total number of connections` with legend `Back` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend connections error by instance" + Given the following series: + | metrics | values | + | haproxy_backend_connection_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_connection_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total number of connections` with legend `Back errors` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total frontend bytes incoming by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_bytes_in_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_bytes_in_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `IN Front` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 24 | + +Scenario: "Test Total frontend bytes outgoing by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_bytes_out_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_bytes_out_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `OUT Front` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 24 | + +Scenario: "Test Total backend bytes incoming by instance" + Given the following series: + | metrics | values | + | haproxy_backend_bytes_in_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_bytes_in_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `IN Back` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 24 | + +Scenario: "Test Total backend bytes outgoing by instance" + Given the following series: + | metrics | values | + | haproxy_backend_bytes_out_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_bytes_out_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `OUT Back` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 24 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/self.feature b/monitoring/ceph-mixin/tests_dashboards/features/self.feature new file mode 100644 index 000000000..2b44ce0dc --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/self.feature @@ -0,0 +1,68 @@ +Feature: Test tester + +Scenario: "Simple query works" + Given the following series: + | metrics | values | + | 
node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + Then query `node_network_transmit_bytes{instance="127.0.0.1"} > 0` produces: + | metrics | values | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 100 | + +Scenario: "Query with evaluation time" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When evaluation time is `0m` + Then query `node_network_transmit_bytes{instance="127.0.0.1"} > 0` produces: + | metrics | values | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 | + +Scenario: "Query with evaluation time and variable value" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When evaluation time is `0m` + And variable `osd_hosts` is `127.0.0.1` + Then query `node_network_transmit_bytes{instance="$osd_hosts"} > 0` produces: + | metrics | values | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 | + +Scenario: "Query with interval time" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 300 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 300 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When evaluation time is `2h` + And evaluation interval is `1h` + And interval is `1h` + And variable `osd_hosts` is `127.0.0.1` + Then query 
`node_network_transmit_bytes{instance="$osd_hosts"} > 0` produces:
+    | metrics | values |
+    | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 200 |
+    | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 200 |
\ No newline at end of file
diff --git a/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py
new file mode 100644
index 000000000..0b90f46f2
--- /dev/null
+++ b/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py
@@ -0,0 +1 @@
+# This file and the steps files are needed, even if empty, because of 'behave' :(
-- cgit v1.2.3
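
For orientation, the feature files above are executed by 'behave' through the step implementations in environment.py, which turn each scenario into a promtool unit test via the PromqlTest helper. The following is a rough sketch of that flow driven directly from Python instead of Gherkin; it uses only the PromqlTest methods that appear in environment.py, and the exact method signatures, the variable substitution, the illustrative query, and the underlying promtool invocation are assumptions inferred from that usage, not documented API.

    from tests_dashboards import PromqlTest

    # Seed two fake series, substitute a dashboard variable, set the query
    # under test, declare the expected samples, then let promtool verify.
    test = PromqlTest()
    test.add_series('ceph_osd_up{ceph_daemon="osd.0", instance="127.0.0.1"}', '0 0')
    test.add_series('ceph_osd_up{ceph_daemon="osd.1", instance="127.0.0.1"}', '1 1')
    test.set_variable('instance', '127.0.0.1')
    test.set_expression('count(ceph_osd_up{instance=~"$instance"} == 0)')
    test.add_exp_samples('{}', 1.0)   # exactly one OSD is expected to be down
    assert test.run_promtool()        # presumably shells out to promtool, as in after_scenario()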