author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /monitoring/ceph-mixin/tests_dashboards
parent     Initial commit.
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'monitoring/ceph-mixin/tests_dashboards')
14 files changed, 1232 insertions, 0 deletions
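
For orientation, a minimal usage sketch (not taken from the commit; the series names and values are illustrative) of the PromqlTest helper that the diff below introduces in tests_dashboards/__init__.py, following the workflow described in its class docstring:

    # Build a promtool unit test for one dashboard query.
    from tests_dashboards import PromqlTest

    test = PromqlTest()

    # The Prometheus query under test.
    test.set_expression('bonding_slaves > 0')

    # Input series that promtool will synthesize.
    test.add_series('bonding_slaves{master="bond0"}', '2')
    test.add_series('bonding_slaves{master="bond1"}', '3')

    # Samples the query is expected to produce at the evaluation time.
    test.add_exp_samples('bonding_slaves{master="bond0"}', 2)
    test.add_exp_samples('bonding_slaves{master="bond1"}', 3)

    # Writes the promtool test YAML to a temporary file and runs
    # `promtool test rules` on it; True means the test passed.
    assert test.run_promtool()
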
diff --git a/monitoring/ceph-mixin/tests_dashboards/__init__.py b/monitoring/ceph-mixin/tests_dashboards/__init__.py new file mode 100644 index 000000000..45147e5c3 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/__init__.py @@ -0,0 +1,189 @@ +import re +import subprocess +import sys +import tempfile +from dataclasses import asdict, dataclass, field +from typing import Any, List + +import yaml + +from .util import replace_grafana_expr_variables + + +@dataclass +class InputSeries: + series: str = '' + values: str = '' + +@dataclass +class ExprSample: + labels: str = '' + value: float = -1 + +@dataclass +class PromqlExprTest: + expr: str = '' + eval_time: str = '1m' + exp_samples: List[ExprSample] = field(default_factory=list) + +@dataclass +class Test: + interval: str = '1m' + input_series: List[InputSeries] = field(default_factory=list) + promql_expr_test: List[PromqlExprTest] = field(default_factory=list) + + +@dataclass +class TestFile: + evaluation_interval: str = '1m' + tests: List[Test] = field(default_factory=list) + + +class PromqlTest: + """ + Base class to provide prometheus query test capabilities. After setting up + the query test with its input and expected output it's expected to run promtool. + + https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-yml + + The workflow of testing would be something like: + + # add prometheus query to test + self.set_expression('bonding_slaves > 0') + + # add some prometheus input series + self.add_series('bonding_slaves{master="bond0"}', '2') + self.add_series('bonding_slaves{master="bond1"}', '3') + self.add_series('node_network_receive_bytes{instance="127.0.0.1", + device="eth1"}', "10 100 230 22") + + # expected output of the query + self.add_exp_samples('bonding_slaves{master="bond0"}', 2) + self.add_exp_samples('bonding_slaves{master="bond1"}', 3) + + # at last, always call promtool with: + self.assertTrue(self.run_promtool()) + # assertTrue means it expect promtool to succeed + """ + + def __init__(self): + self.test_output_file = tempfile.NamedTemporaryFile('w+') + + self.test_file = TestFile() + self.test = Test() + self.promql_expr_test = PromqlExprTest() + self.test.promql_expr_test.append(self.promql_expr_test) + self.test_file.tests.append(self.test) + + self.variables = {} + + def __del__(self): + self.test_output_file.close() + + + def set_evaluation_interval(self, interval: int, unit: str = 'm') -> None: + """ + Set the evaluation interval of the time series + + Args: + interval (int): number of units. + unit (str): unit type: 'ms', 's', 'm', etc... + """ + self.test_file.evaluation_interval = f'{interval}{unit}' + + def set_interval(self, interval: int, unit: str = 'm') -> None: + """ + Set the duration of the time series + + Args: + interval (int): number of units. + unit (str): unit type: 'ms', 's', 'm', etc... + """ + self.test.interval = f'{interval}{unit}' + + def set_expression(self, expr: str) -> None: + """ + Set the prometheus expression/query used to filter data. + + Args: + expr(str): expression/query. + """ + self.promql_expr_test.expr = expr + + def add_series(self, series: str, values: str) -> None: + """ + Add a series to the input. + + Args: + series(str): Prometheus series. + Notation: '<metric name>{<label name>=<label value>, ...}' + values(str): Value of the series. 
+ """ + input_series = InputSeries(series=series, values=values) + self.test.input_series.append(input_series) + + def set_eval_time(self, eval_time: int, unit: str = 'm') -> None: + """ + Set the time when the expression will be evaluated + + Args: + interval (int): number of units. + unit (str): unit type: 'ms', 's', 'm', etc... + """ + self.promql_expr_test.eval_time = f'{eval_time}{unit}' + + def add_exp_samples(self, sample: str, values: Any) -> None: + """ + Add an expected sample/output of the query given the series/input + + Args: + sample(str): Expected sample. + Notation: '<metric name>{<label name>=<label value>, ...}' + values(Any): Value of the sample. + """ + expr_sample = ExprSample(labels=sample, value=values) + self.promql_expr_test.exp_samples.append(expr_sample) + + def set_variable(self, variable: str, value: str): + """ + If a query makes use of grafonnet variables, for example + '$osd_hosts', you should change this to a real value. Example: + + + > self.set_expression('bonding_slaves{master="$osd_hosts"} > 0') + > self.set_variable('osd_hosts', '127.0.0.1') + > print(self.query) + > bonding_slaves{master="127.0.0.1"} > 0 + + Args: + variable(str): Variable name + value(str): Value to replace variable with + + """ + self.variables[variable] = value + + def run_promtool(self): + """ + Run promtool to test the query after setting up the input, output + and extra parameters. + + Returns: + bool: True if successful, False otherwise. + """ + + for variable, value in self.variables.items(): + expr = self.promql_expr_test.expr + new_expr = replace_grafana_expr_variables(expr, variable, value) + self.set_expression(new_expr) + + test_as_dict = asdict(self.test_file) + yaml.dump(test_as_dict, self.test_output_file) + + args = f'promtool test rules {self.test_output_file.name}'.split() + try: + subprocess.run(args, check=True) + return True + except subprocess.CalledProcessError as process_error: + print(yaml.dump(test_as_dict)) + print(process_error.stderr) + return False diff --git a/monitoring/ceph-mixin/tests_dashboards/features/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/__init__.py diff --git a/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature b/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature new file mode 100644 index 000000000..1a446cd2c --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature @@ -0,0 +1,54 @@ +Feature: Ceph Cluster Dashboard + +Scenario: "Test total PG States" + Given the following series: + | metrics | values | + | ceph_pg_total{foo="var"} | 10 100 | + | ceph_pg_total{foo="bar"} | 20 200 | + Then Grafana panel `PG States` with legend `Total` shows: + | metrics | values | + | {} | 300 | + +Scenario: "Test OSDs in" + Given the following series: + | metrics | values | + | ceph_osd_in{ceph_daemon="osd.0"} | 1.0 | + | ceph_osd_in{ceph_daemon="osd.1"} | 0.0 | + | ceph_osd_in{ceph_daemon="osd.2"} | 1.0 | + When variable `instance` is `.*` + Then Grafana panel `OSDs` with legend `In` shows: + | metrics | values | + | {} | 2 | + +Scenario: "Test OSDs down" + Given the following series: + | metrics | values | + | ceph_osd_up{ceph_daemon="osd.0", instance="127.0.0.1"} | 0.0 | + | ceph_osd_up{ceph_daemon="osd.1", instance="127.0.0.1"} | 0.0 | + | ceph_osd_up{ceph_daemon="osd.2", instance="127.0.0.1"} | 0.0 | + When variable `instance` is `127.0.0.1` 
+ Then Grafana panel `OSDs` with legend `Down` shows: + | metrics | values | + | {} | 3 | + +Scenario: "Test OSDs out" + Given the following series: + | metrics | values | + | ceph_osd_in{ceph_daemon="osd.0", instance="127.0.0.1"} | 0.0 | + | ceph_osd_in{ceph_daemon="osd.1", instance="127.0.0.1"} | 1.0 | + | ceph_osd_in{ceph_daemon="osd.2", instance="127.0.0.1"} | 0.0 | + When variable `instance` is `127.0.0.1` + Then Grafana panel `OSDs` with legend `Out` shows: + | metrics | values | + | {} | 2 | + +Scenario: "Test OSDs all" + Given the following series: + | metrics | values | + | ceph_osd_metadata{ceph_daemon="osd.0", instance="127.0.0.1"} | 1.0 | + | ceph_osd_metadata{ceph_daemon="osd.1", instance="127.0.0.1"} | 1.0 | + | ceph_osd_metadata{ceph_daemon="osd.2", instance="127.0.0.1"} | 1.0 | + When variable `instance` is `127.0.0.1` + Then Grafana panel `OSDs` with legend `All` shows: + | metrics | values | + | {} | 3 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/environment.py b/monitoring/ceph-mixin/tests_dashboards/features/environment.py new file mode 100644 index 000000000..5dc76a09e --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/environment.py @@ -0,0 +1,135 @@ +# type: ignore[no-redef] +# pylint: disable=E0611,W0613,E0102 +import copy + +from behave import given, then, when +from prettytable import PrettyTable + +from tests_dashboards import PromqlTest +from tests_dashboards.util import get_dashboards_data, resolve_time_and_unit + + +class GlobalContext: + def __init__(self): + self.tested_queries_count = 0 + self.promql_expr_test = None + self.data = get_dashboards_data() + self.query_map = self.data['queries'] + + def reset_promql_test(self): + self.promql_expr_test = PromqlTest() + self.promql_expr_test.variables = copy.copy(self.data['variables']) + + def print_query_stats(self): + total = len(self.query_map) + table = PrettyTable() + table.field_names = ['Name', 'Queries', 'Tested', 'Cover'] + + def percent(tested, total): + return str(round((tested / total) * 100, 2)) + '%' + + def file_name(path): + return path.split('/')[-1] + + total = 0 + tested = 0 + for path, stat in self.data['stats'].items(): + assert stat['total'] + table.add_row([file_name(path), stat['total'], stat['tested'], + percent(stat['tested'], stat['total'])]) + total += stat['total'] + tested += stat['tested'] + + assert total + table.add_row(['Total', total, tested, percent(tested, total)]) + print(table) + + +global_context = GlobalContext() + +# Behave function overloading +# =========================== + + +def before_scenario(context, scenario): + global_context.reset_promql_test() + + +def after_scenario(context, scenario): + assert global_context.promql_expr_test.run_promtool() + + +def after_all(context): + global_context.print_query_stats() + + +@given("the following series") +def step_impl(context): + for row in context.table: + metric = row['metrics'] + value = row['values'] + global_context.promql_expr_test.add_series(metric, value) + + +@when('evaluation interval is `{interval}`') +def step_impl(context, interval): + interval_without_unit, unit = resolve_time_and_unit(interval) + if interval_without_unit is None: + raise ValueError(f'Invalid interval time: {interval_without_unit}. 
' + + 'A valid time looks like "1m" where you have a number plus a unit') + global_context.promql_expr_test.set_evaluation_interval(interval_without_unit, unit) + + +@when('interval is `{interval}`') +def step_impl(context, interval): + interval_without_unit, unit = resolve_time_and_unit(interval) + if interval_without_unit is None: + raise ValueError(f'Invalid interval time: {interval_without_unit}. ' + + 'A valid time looks like "1m" where you have a number plus a unit') + global_context.promql_expr_test.set_interval(interval_without_unit, unit) + + +@when('evaluation time is `{eval_time}`') +def step_impl(context, eval_time): + eval_time_without_unit, unit = resolve_time_and_unit(eval_time) + if eval_time_without_unit is None: + raise ValueError(f'Invalid evalution time: {eval_time}. ' + + 'A valid time looks like "1m" where you have a number plus a unit') + global_context.promql_expr_test.set_eval_time(eval_time_without_unit, unit) + + +@when('variable `{variable}` is `{value}`') +def step_impl(context, variable, value): + global_context.promql_expr_test.set_variable(variable, value) + + +@then('Grafana panel `{panel_name}` with legend `{legend}` shows') +def step_impl(context, panel_name, legend): + """ + This step can have an empty legend. As 'behave' doesn't provide a way + to say it's empty we use EMPTY to mark as empty. + """ + if legend == "EMPTY": + legend = '' + query_id = panel_name + '-' + legend + if query_id not in global_context.query_map: + raise KeyError((f'Query with legend {legend} in panel "{panel_name}"' + 'couldn\'t be found')) + + expr = global_context.query_map[query_id]['query'] + global_context.promql_expr_test.set_expression(expr) + for row in context.table: + metric = row['metrics'] + value = row['values'] + global_context.promql_expr_test.add_exp_samples(metric, float(value)) + path = global_context.query_map[query_id]['path'] + global_context.data['stats'][path]['tested'] += 1 + + +@then('query `{query}` produces') +def step_impl(context, query): + global_context.promql_expr_test.set_expression(query) + for row in context.table: + metric = row['metrics'] + value = row['values'] + global_context.promql_expr_test.add_exp_samples(metric, float(value)) diff --git a/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature new file mode 100644 index 000000000..51e3c5819 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature @@ -0,0 +1,131 @@ +Feature: Host Details Dashboard + +Scenario: "Test OSD" + Given the following series: + | metrics | values | + | ceph_osd_metadata{job="ceph",back_iface="",ceph_daemon="osd.0",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 | + | ceph_osd_metadata{job="ceph",back_iface="",ceph_daemon="osd.1",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 | + | ceph_osd_metadata{job="ceph",back_iface="",ceph_daemon="osd.2",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a 
(6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 | + When variable `ceph_hosts` is `127.0.0.1` + Then Grafana panel `OSDs` with legend `EMPTY` shows: + | metrics | values | + | {} | 3 | + +# IOPS Panel - begin + +Scenario: "Test Disk IOPS - Writes - Several OSDs per device" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0 osd.1 osd.2",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.3 osd.4 osd.5",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) writes` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0 osd.1 osd.2", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.3 osd.4 osd.5", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test Disk IOPS - Writes - Single OSD per device" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) writes` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test Disk IOPS - Reads - Several OSDs per device" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0 osd.1 osd.2",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.3 osd.4 osd.5",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) reads` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0 osd.1 osd.2", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.3 osd.4 osd.5", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test Disk IOPS - Reads - Single OSD per device" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel 
`$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) reads` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 | + +# IOPS Panel - end + +# Node disk bytes written/read panel - begin + +Scenario: "Test disk throughput - read" + Given the following series: + | metrics | values | + | node_disk_read_bytes_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_read_bytes_total{job="ceph",device="sdb",instance="localhost:9100"} | 100+600x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Throughput by Disk` with legend `{{device}}({{ceph_daemon}}) read` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 10 | + +Scenario: "Test disk throughput - write" + Given the following series: + | metrics | values | + | node_disk_written_bytes_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_written_bytes_total{job="ceph",device="sdb",instance="localhost:9100"} | 100+600x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Throughput by Disk` with legend `{{device}}({{ceph_daemon}}) write` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 10 | + +# Node disk bytes written/read panel - end + +Scenario: "Test $ceph_hosts Disk Latency panel" + Given the following series: + | metrics | values | + | node_disk_write_time_seconds_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_write_time_seconds_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_writes_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_read_time_seconds_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_read_time_seconds_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_reads_completed_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk Latency` with legend `{{device}}({{ceph_daemon}})` shows: + | metrics | values | + | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 | + | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 | + +Scenario: "Test $ceph_hosts 
Disk utilization" + Given the following series: + | metrics | values | + | node_disk_io_time_seconds_total{job="ceph",device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_io_time_seconds_total{job="ceph",device="sdb",instance="localhost:9100"} | 10+60x1 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `ceph_hosts` is `localhost` + Then Grafana panel `$ceph_hosts Disk utilization` with legend `{{device}}({{ceph_daemon}})` shows: + | metrics | values | + | {job="ceph",ceph_daemon="osd.0", device="sda", instance="localhost"} | 100 | + | {job="ceph",ceph_daemon="osd.1", device="sdb", instance="localhost"} | 100 | + diff --git a/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature new file mode 100644 index 000000000..6c5eceaed --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature @@ -0,0 +1,41 @@ +Feature: Hosts Overview Dashboard + +Scenario: "Test network load succeeds" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + When variable `osd_hosts` is `127.0.0.1` + Then Grafana panel `Network Load` with legend `EMPTY` shows: + | metrics | values | + | {} | 6 | + +Scenario: "Test network load with bonding succeeds" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="bond0"} | 20 200 300 | + | node_network_transmit_bytes{instance="127.0.0.1", device="bond0"} | 20 200 300 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When variable `osd_hosts` is `127.0.0.1` + Then Grafana panel `Network Load` with legend `EMPTY` shows: + | metrics | values | + | {} | 6 | + +Scenario: "Test AVG Disk Utilization" + Given the following series: + | metrics | values | + | node_disk_io_time_seconds_total{device="sda",instance="localhost:9100"} | 10+60x1 | + | node_disk_io_time_seconds_total{device="sdb",instance="localhost:9100"} | 10+60x1 | + | node_disk_io_time_seconds_total{device="sdc",instance="localhost:9100"} | 10 2000 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd_hosts` is `localhost` + Then Grafana panel `AVG Disk Utilization` with legend `EMPTY` shows: + | metrics | values | + | {} | 100 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature new file mode 100644 index 000000000..0d6ca8b17 --- /dev/null +++ 
b/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature @@ -0,0 +1,88 @@ +Feature: OSD device details + +Scenario: "Test Physical Device Latency for $osd - Reads" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 60 | + | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 60 | + | node_disk_read_time_seconds_total{device="sda",instance="localhost"} | 100 600 | + | node_disk_read_time_seconds_total{device="sdb",instance="localhost"} | 100 600 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device Latency for $osd` with legend `{{instance}}/{{device}} Reads` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 10 | + +Scenario: "Test Physical Device Latency for $osd - Writes" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 60 | + | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 60 | + | node_disk_write_time_seconds_total{device="sda",instance="localhost"} | 100 600 | + | node_disk_write_time_seconds_total{device="sdb",instance="localhost"} | 100 600 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device Latency for $osd` with legend `{{instance}}/{{device}} Writes` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 10 | + +Scenario: "Test Physical Device R/W IOPS for $osd - Writes" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 100 | + | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Writes` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device R/W IOPS for $osd - Reads" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 100 | + | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Reads` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device R/W Bytes for $osd - Reads" + Given the following series: + | metrics | values | + | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 100 | + | 
node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Reads` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device R/W Bytes for $osd - Writes" + Given the following series: + | metrics | values | + | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 100 | + | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Writes` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | + +Scenario: "Test Physical Device Util% for $osd" + Given the following series: + | metrics | values | + | node_disk_io_time_seconds_total{device="sda",instance="localhost:9100"} | 10 100 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 | + | ceph_disk_occupation_human{job="ceph",ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 | + When variable `osd` is `osd.0` + Then Grafana panel `Physical Device Util% for $osd` with legend `{{device}} on {{instance}}` shows: + | metrics | values | + | {device="sda",instance="localhost"} | 1.5 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature new file mode 100644 index 000000000..78d306419 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature @@ -0,0 +1,15 @@ +Feature: OSD Overview + +Scenario: "Test OSD onode Hits Ratio" + Given the following series: + | metrics | values | + | ceph_bluestore_onode_hits{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"} | 5255 | + | ceph_bluestore_onode_hits{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"} | 5419 | + | ceph_bluestore_onode_hits{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"} | 5242 | + | ceph_bluestore_onode_misses{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"} | 202 | + | ceph_bluestore_onode_misses{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"} | 247 | + | ceph_bluestore_onode_misses{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"} | 234 | + Then Grafana panel `OSD onode Hits Ratio` with legend `EMPTY` shows: + | metrics | values | + | {} | 9.588529429483704E-01 | + diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature new file mode 100644 index 000000000..e0016c507 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature @@ -0,0 +1,139 @@ +Feature: RGW Host Detail Dashboard + +Scenario: "Test $rgw_servers GET/PUT Latencies - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | 
+ | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `$rgw_servers GET/PUT Latencies` with legend `GET {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance_id="58892247"} | 1.5 | + +Scenario: "Test $rgw_servers GET/PUT Latencies - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `$rgw_servers GET/PUT Latencies` with legend `PUT {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance_id="58892247"} | 1 | + +Scenario: "Test Bandwidth by HTTP Operation - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.1` + Then Grafana panel `Bandwidth by HTTP Operation` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1.5 | + +Scenario: "Test Bandwidth by HTTP Operation - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.1` + Then Grafana panel `Bandwidth by HTTP Operation` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 7.5E-01 | + +Scenario: "Test HTTP Request Breakdown - Requests Failed" + Given the following series: + | metrics | values | + | ceph_rgw_failed_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 5 7 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `Requests Failed {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1E-01 | + +Scenario: "Test HTTP Request Breakdown - GET" + Given the following series: + | metrics | values | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", 
instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.1666666666666667 | + +Scenario: "Test HTTP Request Breakdown - PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.5 | + +Scenario: "Test HTTP Request Breakdown - Other" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 175 250 345 | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `HTTP Request Breakdown` with legend `Other {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .16666666666666652 | + +Scenario: "Test Workload Breakdown - Failures" + Given the following series: + | metrics | values | + | ceph_rgw_failed_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 5 7 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `Failures {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1E-01 | + +Scenario: "Test Workload Breakdown - GETs" + Given the following series: + | metrics | values | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 150 170 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `GETs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.1666666666666667 | + +Scenario: "Test Workload Breakdown - PUTs" + Given the following series: + | metrics | values | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `PUTs {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1.5 | + +Scenario: "Test Workload Breakdown - Other" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 175 250 345 | + | ceph_rgw_get{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 100 
150 170 | + | ceph_rgw_put{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 70 90 160 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + And variable `rgw_servers` is `rgw.foo` + Then Grafana panel `Workload Breakdown` with legend `Other (DELETE,LIST) {{ceph_daemon}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph"} | .16666666666666652 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature new file mode 100644 index 000000000..642e43978 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature @@ -0,0 +1,250 @@ +Feature: RGW Overview Dashboard + +Scenario: "Test Average GET Latencies" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `GET {{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1.5 | + +Scenario: "Test Average PUT Latencies" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `PUT {{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1 | + +Scenario: "Test Total Requests/sec by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {rgw_host="1"} | 1.5 | + +Scenario: "Test GET Latencies by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When interval is `30s` + Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1.5 | + +Scenario: "Test Bandwidth Consumed by Type- GET" + Given the following 
series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `Bandwidth Consumed by Type` with legend `GETs` shows: + | metrics | values | + | {} | 1.5 | + +Scenario: "Test Bandwidth Consumed by Type- PUT" + Given the following series: + | metrics | values | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `Bandwidth Consumed by Type` with legend `PUTs` shows: + | metrics | values | + | {} | 7.5E-01 | + +Scenario: "Test Bandwidth by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.25 | + +Scenario: "Test PUT Latencies by RGW Instance" + Given the following series: + | metrics | values | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | + When evaluation time is `1m` + And interval is `30s` + Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows: + | metrics | values | + | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1 | + +Scenario: "Test Total backend responses by HTTP code" + Given the following series: + | metrics | values | + | haproxy_backend_http_responses_total{job="haproxy",code="200",instance="ingress.rgw.1",proxy="backend"} | 10 100 | + | haproxy_backend_http_responses_total{job="haproxy",code="404",instance="ingress.rgw.1",proxy="backend"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + When variable `code` is `200` + Then Grafana panel `Total responses by HTTP code` with legend `Backend {{ code }}` shows: + | metrics | values | + | {code="200"} | 1.5 | + +Scenario: "Test Total frontend responses by HTTP code" + Given the following series: + | metrics | values | + | haproxy_frontend_http_responses_total{job="haproxy",code="200",instance="ingress.rgw.1",proxy="frontend"} | 10 100 | + | haproxy_frontend_http_responses_total{job="haproxy",code="404",instance="ingress.rgw.1",proxy="frontend"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + When variable `code` is `200` + Then Grafana panel `Total responses by HTTP code` with legend `Frontend {{ code }}` shows: + | metrics | values | + | {code="200"} | 1.5 | + +Scenario: "Test Total http frontend requests by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_http_requests_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_http_requests_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is 
`ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Requests` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend response errors by instance" + Given the following series: + | metrics | values | + | haproxy_backend_response_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_response_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Response errors` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total frontend requests errors by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_request_errors_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_request_errors_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Requests errors` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend redispatch warnings by instance" + Given the following series: + | metrics | values | + | haproxy_backend_redispatch_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_redispatch_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Backend redispatch` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend retry warnings by instance" + Given the following series: + | metrics | values | + | haproxy_backend_retry_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_retry_warnings_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Backend retry` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total frontend requests denied by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_requests_denied_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_requests_denied_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Request denied` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend current queue by instance" + Given the following series: + | metrics | values | + | haproxy_backend_current_queue{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_current_queue{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total requests / responses` with legend `Backend Queued` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 200 | + +Scenario: "Test Total frontend connections by instance" + Given the following series: + | metrics | values | + | 
haproxy_frontend_connections_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_connections_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total number of connections` with legend `Front` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend connections attempts by instance" + Given the following series: + | metrics | values | + | haproxy_backend_connection_attempts_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_connection_attempts_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total number of connections` with legend `Back` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total backend connections error by instance" + Given the following series: + | metrics | values | + | haproxy_backend_connection_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_connection_errors_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Total number of connections` with legend `Back errors` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 3 | + +Scenario: "Test Total frontend bytes incoming by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_bytes_in_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_bytes_in_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `IN Front` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 24 | + +Scenario: "Test Total frontend bytes outgoing by instance" + Given the following series: + | metrics | values | + | haproxy_frontend_bytes_out_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_frontend_bytes_out_total{job="haproxy",proxy="frontend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `OUT Front` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 24 | + +Scenario: "Test Total backend bytes incoming by instance" + Given the following series: + | metrics | values | + | haproxy_backend_bytes_in_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_bytes_in_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `IN Back` shows: + | metrics | values | + | {instance="ingress.rgw.1"} | 24 | + +Scenario: "Test Total backend bytes outgoing by instance" + Given the following series: + | metrics | values | + | haproxy_backend_bytes_out_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 10 100 | + | haproxy_backend_bytes_out_total{job="haproxy",proxy="backend",instance="ingress.rgw.1"} | 20 200 | + When variable `ingress_service` is `ingress.rgw.1` + Then Grafana panel `Current total of incoming / outgoing bytes` with legend `OUT Back` shows: + | metrics | 
values | + | {instance="ingress.rgw.1"} | 24 | diff --git a/monitoring/ceph-mixin/tests_dashboards/features/self.feature b/monitoring/ceph-mixin/tests_dashboards/features/self.feature new file mode 100644 index 000000000..2b44ce0dc --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/self.feature @@ -0,0 +1,68 @@ +Feature: Test tester + +Scenario: "Simple query works" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + Then query `node_network_transmit_bytes{instance="127.0.0.1"} > 0` produces: + | metrics | values | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 100 | + +Scenario: "Query with evaluation time" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When evaluation time is `0m` + Then query `node_network_transmit_bytes{instance="127.0.0.1"} > 0` produces: + | metrics | values | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 | + +Scenario: "Query with evaluation time and variable value" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When evaluation time is `0m` + And variable `osd_hosts` is `127.0.0.1` + Then query `node_network_transmit_bytes{instance="$osd_hosts"} > 0` produces: + | metrics | values | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 | + +Scenario: "Query with interval time" + Given the following series: + | metrics | values | + | node_network_receive_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | node_network_receive_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 10 100 200 | + | 
node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 10 100 200 | + | node_network_transmit_bytes{instance="192.168.100.2", device="bond0"} | 20 200 300 | + | node_network_transmit_bytes{instance="192.168.100.1", device="bond0"} | 20 200 300 | + | bonding_slaves{instance="127.0.0.1", master="bond0"} | 2 | + When evaluation time is `2h` + And evaluation interval is `1h` + And interval is `1h` + And variable `osd_hosts` is `127.0.0.1` + Then query `node_network_transmit_bytes{instance="$osd_hosts"} > 0` produces: + | metrics | values | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth1"} | 200 | + | node_network_transmit_bytes{instance="127.0.0.1", device="eth2"} | 200 |
\ No newline at end of file diff --git a/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py new file mode 100644 index 000000000..0b90f46f2 --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py @@ -0,0 +1 @@ +# This file and steps files is needed even if its empty because of 'behave' :( diff --git a/monitoring/ceph-mixin/tests_dashboards/requirements.txt b/monitoring/ceph-mixin/tests_dashboards/requirements.txt new file mode 100644 index 000000000..8ad130e5b --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/requirements.txt @@ -0,0 +1,12 @@ +attrs==21.2.0 +behave==1.2.6 +py==1.10.0 +pyparsing==2.4.7 +PyYAML==6.0 +types-PyYAML==6.0.0 +typing-extensions==3.10.0.2 +termcolor==1.1.0 +types-termcolor==1.1.2 +dataclasses==0.6 +types-dataclasses==0.6.1 +prettytable==2.4.0 diff --git a/monitoring/ceph-mixin/tests_dashboards/util.py b/monitoring/ceph-mixin/tests_dashboards/util.py new file mode 100644 index 000000000..1fce6559d --- /dev/null +++ b/monitoring/ceph-mixin/tests_dashboards/util.py @@ -0,0 +1,109 @@ +import json +import re +from pathlib import Path +from typing import Any, Dict, Tuple, Union + +from termcolor import cprint + +UNITS = ['ms', 's', 'm', 'h', 'd', 'w', 'y'] + + +def resolve_time_and_unit(time: str) -> Union[Tuple[int, str], Tuple[None, None]]: + """ + Divide time with its unit and return a tuple like (10, 'm') + Return None if its and invalid prometheus time + Valid units are inside UNITS. + """ + if time[-1] in UNITS: + return int(time[:-1]), time[-1] + if time[-2:] in UNITS: + return int(time[:-2]), time[-2:] + return None, None + + +def get_dashboards_data() -> Dict[str, Any]: + data: Dict[str, Any] = {'queries': {}, 'variables': {}, 'stats': {}} + for file in Path(__file__).parent.parent \ + .joinpath('dashboards_out').glob('*.json'): + with open(file, 'r') as f: + dashboard_data = json.load(f) + data['stats'][str(file)] = {'total': 0, 'tested': 0} + add_dashboard_queries(data, dashboard_data, str(file)) + add_dashboard_variables(data, dashboard_data) + add_default_dashboards_variables(data) + return data + + +def add_dashboard_queries(data: Dict[str, Any], dashboard_data: Dict[str, Any], path: str) -> None: + """ + Grafana panels can have more than one target/query, in order to identify each + query in the panel we append the "legendFormat" of the target to the panel name. + format: panel_name-legendFormat + """ + if 'panels' not in dashboard_data: + return + error = 0 + for panel in dashboard_data['panels']: + if ( + 'title' in panel + and 'targets' in panel + and len(panel['targets']) > 0 + and 'expr' in panel['targets'][0] + ): + for target in panel['targets']: + title = panel['title'] + legend_format = target['legendFormat'] if 'legendFormat' in target else "" + query_id = f'{title}-{legend_format}' + if query_id in data['queries']: + # NOTE: If two or more panels have the same name and legend it + # might suggest a refactoring is needed or add something else + # to identify each query. + conflict_file = Path(data['queries'][query_id]['path']).name + file = Path(path).name + cprint((f'ERROR: Query in panel "{title}" with legend "{legend_format}"' + f' already exists. 
Conflict "{conflict_file}" ' + f'with: "{file}"'), 'red') + error = 1 + data['queries'][query_id] = {'query': target['expr'], 'path': path} + data['stats'][path]['total'] += 1 + if error: + raise ValueError('Missing legend_format in queries, please add a proper value.') + + +def add_dashboard_variables(data: Dict[str, Any], dashboard_data: Dict[str, Any]) -> None: + if 'templating' not in dashboard_data or 'list' not in dashboard_data['templating']: + return + for variable in dashboard_data['templating']['list']: + if 'name' in variable: + data['variables'][variable['name']] = 'UNSET VARIABLE' + +def add_default_dashboards_variables(data: Dict[str, Any]) -> None: + data['variables']['job'] = 'ceph' + data['variables']['job_haproxy'] = 'haproxy' + data['variables']['__rate_interval'] = '1m' + +def replace_grafana_expr_variables(expr: str, variable: str, value: Any) -> str: + """ Replace grafana variables in expression with a value + + It should match the whole word, 'osd' musn't match with the 'osd' prefix in 'osd_hosts' + >>> replace_grafana_expr_variables('metric{name~="$osd_hosts|$other|$osd"}', \ + 'osd', 'replacement') + 'metric{name~="$osd_hosts|$other|replacement"}' + + >>> replace_grafana_expr_variables('metric{name~="$osd_hosts|$other|$osd"}', \ + 'other', 'replacement') + 'metric{name~="$osd_hosts|replacement|$osd"}' + + It replaces words with dollar prefix + >>> replace_grafana_expr_variables('metric{name~="no_dollar|$other|$osd"}', \ + 'no_dollar', 'replacement') + 'metric{name~="no_dollar|$other|$osd"}' + + It shouldn't replace the next char after the variable (positive lookahead test). + >>> replace_grafana_expr_variables('metric{name~="$osd"}', \ + 'osd', 'replacement') + 'metric{name~="replacement"}' + """ + regex = fr'\${variable}(?=\W)' + new_expr = re.sub(regex, fr'{value}', expr) + return new_expr |