From be1c7e50e1e8809ea56f2c9d472eccd8ffd73a97 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 04:57:58 +0200 Subject: Adding upstream version 1.44.3. Signed-off-by: Daniel Baumann --- collectors/python.d.plugin/riakkv/Makefile.inc | 13 + collectors/python.d.plugin/riakkv/README.md | 1 + .../python.d.plugin/riakkv/integrations/riakkv.md | 220 +++++++++++++ collectors/python.d.plugin/riakkv/metadata.yaml | 358 +++++++++++++++++++++ collectors/python.d.plugin/riakkv/riakkv.chart.py | 334 +++++++++++++++++++ collectors/python.d.plugin/riakkv/riakkv.conf | 68 ++++ 6 files changed, 994 insertions(+) create mode 100644 collectors/python.d.plugin/riakkv/Makefile.inc create mode 120000 collectors/python.d.plugin/riakkv/README.md create mode 100644 collectors/python.d.plugin/riakkv/integrations/riakkv.md create mode 100644 collectors/python.d.plugin/riakkv/metadata.yaml create mode 100644 collectors/python.d.plugin/riakkv/riakkv.chart.py create mode 100644 collectors/python.d.plugin/riakkv/riakkv.conf (limited to 'collectors/python.d.plugin/riakkv') diff --git a/collectors/python.d.plugin/riakkv/Makefile.inc b/collectors/python.d.plugin/riakkv/Makefile.inc new file mode 100644 index 00000000..87d29f82 --- /dev/null +++ b/collectors/python.d.plugin/riakkv/Makefile.inc @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_python_DATA += riakkv/riakkv.chart.py +dist_pythonconfig_DATA += riakkv/riakkv.conf + +# do not install these files, but include them in the distribution +dist_noinst_DATA += riakkv/README.md riakkv/Makefile.inc + diff --git a/collectors/python.d.plugin/riakkv/README.md b/collectors/python.d.plugin/riakkv/README.md new file mode 120000 index 00000000..f43ece09 --- /dev/null +++ b/collectors/python.d.plugin/riakkv/README.md @@ -0,0 +1 @@ 
+integrations/riakkv.md \ No newline at end of file diff --git a/collectors/python.d.plugin/riakkv/integrations/riakkv.md b/collectors/python.d.plugin/riakkv/integrations/riakkv.md new file mode 100644 index 00000000..2e8279bc --- /dev/null +++ b/collectors/python.d.plugin/riakkv/integrations/riakkv.md @@ -0,0 +1,220 @@ + + +# RiakKV + + + + + +Plugin: python.d.plugin +Module: riakkv + + + +## Overview + +This collector monitors RiakKV metrics about throughput, latency, resources and more. + + +This collector reads the database stats from the `/stats` endpoint. + +This collector is supported on all platforms. + +This collector supports collecting metrics from multiple instances of this integration, including remote instances. + + +### Default Behavior + +#### Auto-Detection + +If the /stats endpoint is accessible, RiakKV instances on the local host running on port 8098 will be autodetected. + +#### Limits + +The default configuration for this integration does not impose any limits on data collection. + +#### Performance Impact + +The default configuration for this integration is not expected to impose a significant performance impact on the system. + + +## Metrics + +Metrics grouped by *scope*. + +The scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels. + + + +### Per RiakKV instance + +These metrics refer to the entire monitored application. + +This scope has no labels. 
+ +Metrics: + +| Metric | Dimensions | Unit | +|:------|:----------|:----| +| riak.kv.throughput | gets, puts | operations/s | +| riak.dt.vnode_updates | counters, sets, maps | operations/s | +| riak.search | queries | queries/s | +| riak.search.documents | indexed | documents/s | +| riak.consistent.operations | gets, puts | operations/s | +| riak.kv.latency.get | mean, median, 95, 99, 100 | ms | +| riak.kv.latency.put | mean, median, 95, 99, 100 | ms | +| riak.dt.latency.counter_merge | mean, median, 95, 99, 100 | ms | +| riak.dt.latency.set_merge | mean, median, 95, 99, 100 | ms | +| riak.dt.latency.map_merge | mean, median, 95, 99, 100 | ms | +| riak.search.latency.query | median, min, 95, 99, 999, max | ms | +| riak.search.latency.index | median, min, 95, 99, 999, max | ms | +| riak.consistent.latency.get | mean, median, 95, 99, 100 | ms | +| riak.consistent.latency.put | mean, median, 95, 99, 100 | ms | +| riak.vm | processes | total | +| riak.vm.memory.processes | allocated, used | MB | +| riak.kv.siblings_encountered.get | mean, median, 95, 99, 100 | siblings | +| riak.kv.objsize.get | mean, median, 95, 99, 100 | KB | +| riak.search.vnodeq_size | mean, median, 95, 99, 100 | messages | +| riak.search.index | errors | errors | +| riak.core.protobuf_connections | active | connections | +| riak.core.repairs | read | repairs | +| riak.core.fsm_active | get, put, secondary index, list keys | fsms | +| riak.core.fsm_rejected | get, put | fsms | +| riak.search.index | bad_entry, extract_fail | writes | + + + +## Alerts + + +The following alerts are available: + +| Alert name | On metric | Description | +|:------------|:----------|:------------| +| [ riakkv_1h_kv_get_mean_latency ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.get | average time between reception of client GET request and subsequent response to client over the last hour | +| [ riakkv_kv_get_slow 
](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.get | average time between reception of client GET request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour | +| [ riakkv_1h_kv_put_mean_latency ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.put | average time between reception of client PUT request and subsequent response to the client over the last hour | +| [ riakkv_kv_put_slow ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.put | average time between reception of client PUT request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour | +| [ riakkv_vm_high_process_count ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.vm | number of processes running in the Erlang VM | +| [ riakkv_list_keys_active ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.core.fsm_active | number of currently running list keys finite state machines | + + +## Setup + +### Prerequisites + +#### Configure RiakKV to enable /stats endpoint + +You can follow the RiakKV configuration reference documentation for how to enable this. + +Source : https://docs.riak.com/riak/kv/2.2.3/configuring/reference/#client-interfaces + + + +### Configuration + +#### File + +The configuration file name for this integration is `python.d/riakkv.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/configure/nodes.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config python.d/riakkv.conf +``` +#### Options + +There are 2 sections: + +* Global variables +* One or more JOBS that can define multiple different instances to monitor. + +The following options can be defined globally: priority, penalty, autodetection_retry, update_every, but can also be defined per JOB to override the global values. + +Additionally, the following collapsed table contains all the options that can be configured inside a JOB definition. + +Every configuration JOB starts with a `job_name` value which will appear in the dashboard, unless a `name` parameter is specified. + + +
Config options + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| update_every | Sets the default data collection frequency. | 5 | no | +| priority | Controls the order of charts at the netdata dashboard. | 60000 | no | +| autodetection_retry | Sets the job re-check interval in seconds. | 0 | no | +| penalty | Indicates whether to apply penalty to update_every in case of failures. | yes | no | +| url | The url of the server | no | yes | + +
+ +#### Examples + +##### Basic (default) + +A basic example configuration per job + +```yaml +local: + url: 'http://localhost:8098/stats' + +``` +##### Multi-instance + +> **Note**: When you define multiple jobs, their names must be unique. + +Collecting metrics from local and remote instances. + + +
Config + +```yaml +local: + url: 'http://localhost:8098/stats' + +remote: + url: 'http://192.0.2.1:8098/stats' + +``` +
+ + + +## Troubleshooting + +### Debug Mode + +To troubleshoot issues with the `riakkv` collector, run the `python.d.plugin` with the debug option enabled. The output +should give you clues as to why the collector isn't working. + +- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on + your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`. + + ```bash + cd /usr/libexec/netdata/plugins.d/ + ``` + +- Switch to the `netdata` user. + + ```bash + sudo -u netdata -s + ``` + +- Run the `python.d.plugin` to debug the collector: + + ```bash + ./python.d.plugin riakkv debug trace + ``` + + diff --git a/collectors/python.d.plugin/riakkv/metadata.yaml b/collectors/python.d.plugin/riakkv/metadata.yaml new file mode 100644 index 00000000..441937f8 --- /dev/null +++ b/collectors/python.d.plugin/riakkv/metadata.yaml @@ -0,0 +1,358 @@ +plugin_name: python.d.plugin +modules: + - meta: + plugin_name: python.d.plugin + module_name: riakkv + monitored_instance: + name: RiakKV + link: "https://riak.com/products/riak-kv/index.html" + categories: + - data-collection.database-servers + icon_filename: "riak.svg" + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + keywords: + - database + - nosql + - big data + most_popular: false + overview: + data_collection: + metrics_description: | + This collector monitors RiakKV metrics about throughput, latency, resources and more. + method_description: "This collector reads the database stats from the `/stats` endpoint." + supported_platforms: + include: [] + exclude: [] + multi_instance: true + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "If the /stats endpoint is accessible, RiakKV instances on the local host running on port 8098 will be autodetected." 
+ limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: + - title: Configure RiakKV to enable /stats endpoint + description: | + You can follow the RiakKV configuration reference documentation for how to enable this. + + Source : https://docs.riak.com/riak/kv/2.2.3/configuring/reference/#client-interfaces + configuration: + file: + name: "python.d/riakkv.conf" + options: + description: | + There are 2 sections: + + * Global variables + * One or more JOBS that can define multiple different instances to monitor. + + The following options can be defined globally: priority, penalty, autodetection_retry, update_every, but can also be defined per JOB to override the global values. + + Additionally, the following collapsed table contains all the options that can be configured inside a JOB definition. + + Every configuration JOB starts with a `job_name` value which will appear in the dashboard, unless a `name` parameter is specified. + folding: + title: "Config options" + enabled: true + list: + - name: update_every + description: Sets the default data collection frequency. + default_value: 5 + required: false + - name: priority + description: Controls the order of charts at the netdata dashboard. + default_value: 60000 + required: false + - name: autodetection_retry + description: Sets the job re-check interval in seconds. + default_value: 0 + required: false + - name: penalty + description: Indicates whether to apply penalty to update_every in case of failures. + default_value: yes + required: false + - name: url + description: The url of the server + default_value: no + required: true + examples: + folding: + enabled: true + title: "Config" + list: + - name: Basic (default) + folding: + enabled: false + description: A basic example configuration per job + config: | + local: + url: 'http://localhost:8098/stats' + - name: Multi-instance + description: | + > **Note**: When you define multiple jobs, their names must be unique. 
+ + Collecting metrics from local and remote instances. + config: | + local: + url: 'http://localhost:8098/stats' + + remote: + url: 'http://192.0.2.1:8098/stats' + troubleshooting: + problems: + list: [] + alerts: + - name: riakkv_1h_kv_get_mean_latency + link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf + metric: riak.kv.latency.get + info: average time between reception of client GET request and subsequent response to client over the last hour + - name: riakkv_kv_get_slow + link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf + metric: riak.kv.latency.get + info: average time between reception of client GET request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour + - name: riakkv_1h_kv_put_mean_latency + link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf + metric: riak.kv.latency.put + info: average time between reception of client PUT request and subsequent response to the client over the last hour + - name: riakkv_kv_put_slow + link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf + metric: riak.kv.latency.put + info: average time between reception of client PUT request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour + - name: riakkv_vm_high_process_count + link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf + metric: riak.vm + info: number of processes running in the Erlang VM + - name: riakkv_list_keys_active + link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf + metric: riak.core.fsm_active + info: number of currently running list keys finite state machines + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: [] + scopes: + - name: global + description: "These metrics refer to the entire monitored application." 
+ labels: [] + metrics: + - name: riak.kv.throughput + description: Reads & writes coordinated by this node + unit: "operations/s" + chart_type: line + dimensions: + - name: gets + - name: puts + - name: riak.dt.vnode_updates + description: Update operations coordinated by local vnodes by data type + unit: "operations/s" + chart_type: line + dimensions: + - name: counters + - name: sets + - name: maps + - name: riak.search + description: Search queries on the node + unit: "queries/s" + chart_type: line + dimensions: + - name: queries + - name: riak.search.documents + description: Documents indexed by search + unit: "documents/s" + chart_type: line + dimensions: + - name: indexed + - name: riak.consistent.operations + description: Consistent node operations + unit: "operations/s" + chart_type: line + dimensions: + - name: gets + - name: puts + - name: riak.kv.latency.get + description: Time between reception of a client GET request and subsequent response to client + unit: "ms" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.kv.latency.put + description: Time between reception of a client PUT request and subsequent response to client + unit: "ms" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.dt.latency.counter_merge + description: Time it takes to perform an Update Counter operation + unit: "ms" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.dt.latency.set_merge + description: Time it takes to perform an Update Set operation + unit: "ms" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.dt.latency.map_merge + description: Time it takes to perform an Update Map operation + unit: "ms" + chart_type: line + dimensions: + - name: mean + - name: median + - name: 
"95" + - name: "99" + - name: "100" + - name: riak.search.latency.query + description: Search query latency + unit: "ms" + chart_type: line + dimensions: + - name: median + - name: min + - name: "95" + - name: "99" + - name: "999" + - name: max + - name: riak.search.latency.index + description: Time it takes Search to index a new document + unit: "ms" + chart_type: line + dimensions: + - name: median + - name: min + - name: "95" + - name: "99" + - name: "999" + - name: max + - name: riak.consistent.latency.get + description: Strongly consistent read latency + unit: "ms" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.consistent.latency.put + description: Strongly consistent write latency + unit: "ms" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.vm + description: Total processes running in the Erlang VM + unit: "total" + chart_type: line + dimensions: + - name: processes + - name: riak.vm.memory.processes + description: Memory allocated & used by Erlang processes + unit: "MB" + chart_type: line + dimensions: + - name: allocated + - name: used + - name: riak.kv.siblings_encountered.get + description: Number of siblings encountered during GET operations by this node during the past minute + unit: "siblings" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.kv.objsize.get + description: Object size encountered by this node during the past minute + unit: "KB" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: "100" + - name: riak.search.vnodeq_size + description: Number of unprocessed messages in the vnode message queues of Search on this node in the past minute + unit: "messages" + chart_type: line + dimensions: + - name: mean + - name: median + - name: "95" + - name: "99" + - name: 
"100" + - name: riak.search.index + description: Number of document index errors encountered by Search + unit: "errors" + chart_type: line + dimensions: + - name: errors + - name: riak.core.protobuf_connections + description: Protocol buffer connections by status + unit: "connections" + chart_type: line + dimensions: + - name: active + - name: riak.core.repairs + description: Number of repair operations this node has coordinated + unit: "repairs" + chart_type: line + dimensions: + - name: read + - name: riak.core.fsm_active + description: Active finite state machines by kind + unit: "fsms" + chart_type: line + dimensions: + - name: get + - name: put + - name: secondary index + - name: list keys + - name: riak.core.fsm_rejected + description: Finite state machines being rejected by Sidejobs overload protection + unit: "fsms" + chart_type: line + dimensions: + - name: get + - name: put + - name: riak.search.index + description: Number of writes to Search failed due to bad data format by reason + unit: "writes" + chart_type: line + dimensions: + - name: bad_entry + - name: extract_fail diff --git a/collectors/python.d.plugin/riakkv/riakkv.chart.py b/collectors/python.d.plugin/riakkv/riakkv.chart.py new file mode 100644 index 00000000..c390c8bc --- /dev/null +++ b/collectors/python.d.plugin/riakkv/riakkv.chart.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +# Description: riak netdata python.d module +# +# See also: +# https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html + +from json import loads + +from bases.FrameworkServices.UrlService import UrlService + +# Riak updates the metrics at the /stats endpoint every 1 second. +# If we use `update_every = 1` here, that means we might get weird jitter in the graph, +# so the default is set to 2 seconds to prevent it. 
+update_every = 2 + +# charts order (can be overridden if you want less charts, or different order) +ORDER = [ + # Throughput metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#throughput-metrics + # Collected in totals. + "kv.node_operations", # K/V node operations. + "dt.vnode_updates", # Data type vnode updates. + "search.queries", # Search queries on the node. + "search.documents", # Documents indexed by Search. + "consistent.operations", # Consistent node operations. + + # Latency metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#latency-metrics + # Collected for the past minute in milliseconds, + # returned from riak in microseconds. + "kv.latency.get", # K/V GET FSM traversal latency. + "kv.latency.put", # K/V PUT FSM traversal latency. + "dt.latency.counter", # Update Counter Data type latency. + "dt.latency.set", # Update Set Data type latency. + "dt.latency.map", # Update Map Data type latency. + "search.latency.query", # Search query latency. + "search.latency.index", # Time it takes for search to index a new document. + "consistent.latency.get", # Strong consistent read latency. + "consistent.latency.put", # Strong consistent write latency. + + # Erlang resource usage metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#erlang-resource-usage-metrics + # Processes collected as a gauge, + # memory collected as Megabytes, returned as bytes from Riak. + "vm.processes", # Number of processes currently running in the Erlang VM. + "vm.memory.processes", # Total amount of memory allocated & used for Erlang processes. 
+ + # General Riak Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-load-health-metrics + # The following are collected by Riak over the past minute: + "kv.siblings_encountered.get", # Siblings encountered during GET operations by this node. + "kv.objsize.get", # Object size encountered by this node. + "search.vnodeq_size", # Number of unprocessed messages in the vnode message queues (Search). + # The following are calculated in total, or as gauges: + "search.index_errors", # Errors of the search subsystem while indexing documents. + "core.pbc", # Number of currently active protocol buffer connections. + "core.repairs", # Total read repair operations coordinated by this node. + "core.fsm_active", # Active finite state machines by kind. + "core.fsm_rejected", # Rejected finite state machines by kind. + + # General Riak Search Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-search-load-health-metrics + # Reported as counters. + "search.errors", # Write and read errors of the Search subsystem. 
+] + +CHARTS = { + # Throughput metrics + "kv.node_operations": { + "options": [None, "Reads & writes coordinated by this node", "operations/s", "throughput", "riak.kv.throughput", + "line"], + "lines": [ + ["node_gets_total", "gets", "incremental"], + ["node_puts_total", "puts", "incremental"] + ] + }, + "dt.vnode_updates": { + "options": [None, "Update operations coordinated by local vnodes by data type", "operations/s", "throughput", + "riak.dt.vnode_updates", "line"], + "lines": [ + ["vnode_counter_update_total", "counters", "incremental"], + ["vnode_set_update_total", "sets", "incremental"], + ["vnode_map_update_total", "maps", "incremental"], + ] + }, + "search.queries": { + "options": [None, "Search queries on the node", "queries/s", "throughput", "riak.search", "line"], + "lines": [ + ["search_query_throughput_count", "queries", "incremental"] + ] + }, + "search.documents": { + "options": [None, "Documents indexed by search", "documents/s", "throughput", "riak.search.documents", "line"], + "lines": [ + ["search_index_throughput_count", "indexed", "incremental"] + ] + }, + "consistent.operations": { + "options": [None, "Consistent node operations", "operations/s", "throughput", "riak.consistent.operations", + "line"], + "lines": [ + ["consistent_gets_total", "gets", "incremental"], + ["consistent_puts_total", "puts", "incremental"], + ] + }, + + # Latency metrics + "kv.latency.get": { + "options": [None, "Time between reception of a client GET request and subsequent response to client", "ms", + "latency", "riak.kv.latency.get", "line"], + "lines": [ + ["node_get_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_get_fsm_time_median", "median", "absolute", 1, 1000], + ["node_get_fsm_time_95", "95", "absolute", 1, 1000], + ["node_get_fsm_time_99", "99", "absolute", 1, 1000], + ["node_get_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "kv.latency.put": { + "options": [None, "Time between reception of a client PUT request and subsequent response to 
client", "ms", + "latency", "riak.kv.latency.put", "line"], + "lines": [ + ["node_put_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_put_fsm_time_median", "median", "absolute", 1, 1000], + ["node_put_fsm_time_95", "95", "absolute", 1, 1000], + ["node_put_fsm_time_99", "99", "absolute", 1, 1000], + ["node_put_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.counter": { + "options": [None, "Time it takes to perform an Update Counter operation", "ms", "latency", + "riak.dt.latency.counter_merge", "line"], + "lines": [ + ["object_counter_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_counter_merge_time_median", "median", "absolute", 1, 1000], + ["object_counter_merge_time_95", "95", "absolute", 1, 1000], + ["object_counter_merge_time_99", "99", "absolute", 1, 1000], + ["object_counter_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.set": { + "options": [None, "Time it takes to perform an Update Set operation", "ms", "latency", + "riak.dt.latency.set_merge", "line"], + "lines": [ + ["object_set_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_set_merge_time_median", "median", "absolute", 1, 1000], + ["object_set_merge_time_95", "95", "absolute", 1, 1000], + ["object_set_merge_time_99", "99", "absolute", 1, 1000], + ["object_set_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.map": { + "options": [None, "Time it takes to perform an Update Map operation", "ms", "latency", + "riak.dt.latency.map_merge", "line"], + "lines": [ + ["object_map_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_map_merge_time_median", "median", "absolute", 1, 1000], + ["object_map_merge_time_95", "95", "absolute", 1, 1000], + ["object_map_merge_time_99", "99", "absolute", 1, 1000], + ["object_map_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "search.latency.query": { + "options": [None, "Search query latency", "ms", "latency", "riak.search.latency.query", "line"], + "lines": [ + 
["search_query_latency_median", "median", "absolute", 1, 1000], + ["search_query_latency_min", "min", "absolute", 1, 1000], + ["search_query_latency_95", "95", "absolute", 1, 1000], + ["search_query_latency_99", "99", "absolute", 1, 1000], + ["search_query_latency_999", "999", "absolute", 1, 1000], + ["search_query_latency_max", "max", "absolute", 1, 1000], + ] + }, + "search.latency.index": { + "options": [None, "Time it takes Search to index a new document", "ms", "latency", "riak.search.latency.index", + "line"], + "lines": [ + ["search_index_latency_median", "median", "absolute", 1, 1000], + ["search_index_latency_min", "min", "absolute", 1, 1000], + ["search_index_latency_95", "95", "absolute", 1, 1000], + ["search_index_latency_99", "99", "absolute", 1, 1000], + ["search_index_latency_999", "999", "absolute", 1, 1000], + ["search_index_latency_max", "max", "absolute", 1, 1000], + ] + }, + + # Riak Strong Consistency metrics + "consistent.latency.get": { + "options": [None, "Strongly consistent read latency", "ms", "latency", "riak.consistent.latency.get", "line"], + "lines": [ + ["consistent_get_time_mean", "mean", "absolute", 1, 1000], + ["consistent_get_time_median", "median", "absolute", 1, 1000], + ["consistent_get_time_95", "95", "absolute", 1, 1000], + ["consistent_get_time_99", "99", "absolute", 1, 1000], + ["consistent_get_time_100", "100", "absolute", 1, 1000], + ] + }, + "consistent.latency.put": { + "options": [None, "Strongly consistent write latency", "ms", "latency", "riak.consistent.latency.put", "line"], + "lines": [ + ["consistent_put_time_mean", "mean", "absolute", 1, 1000], + ["consistent_put_time_median", "median", "absolute", 1, 1000], + ["consistent_put_time_95", "95", "absolute", 1, 1000], + ["consistent_put_time_99", "99", "absolute", 1, 1000], + ["consistent_put_time_100", "100", "absolute", 1, 1000], + ] + }, + + # BEAM metrics + "vm.processes": { + "options": [None, "Total processes running in the Erlang VM", "total", "vm", 
"riak.vm", "line"], + "lines": [ + ["sys_process_count", "processes", "absolute"], + ] + }, + "vm.memory.processes": { + "options": [None, "Memory allocated & used by Erlang processes", "MB", "vm", "riak.vm.memory.processes", + "line"], + "lines": [ + ["memory_processes", "allocated", "absolute", 1, 1024 * 1024], + ["memory_processes_used", "used", "absolute", 1, 1024 * 1024] + ] + }, + + # General Riak Load/Health metrics + "kv.siblings_encountered.get": { + "options": [None, "Number of siblings encountered during GET operations by this node during the past minute", + "siblings", "load", "riak.kv.siblings_encountered.get", "line"], + "lines": [ + ["node_get_fsm_siblings_mean", "mean", "absolute"], + ["node_get_fsm_siblings_median", "median", "absolute"], + ["node_get_fsm_siblings_95", "95", "absolute"], + ["node_get_fsm_siblings_99", "99", "absolute"], + ["node_get_fsm_siblings_100", "100", "absolute"], + ] + }, + "kv.objsize.get": { + "options": [None, "Object size encountered by this node during the past minute", "KB", "load", + "riak.kv.objsize.get", "line"], + "lines": [ + ["node_get_fsm_objsize_mean", "mean", "absolute", 1, 1024], + ["node_get_fsm_objsize_median", "median", "absolute", 1, 1024], + ["node_get_fsm_objsize_95", "95", "absolute", 1, 1024], + ["node_get_fsm_objsize_99", "99", "absolute", 1, 1024], + ["node_get_fsm_objsize_100", "100", "absolute", 1, 1024], + ] + }, + "search.vnodeq_size": { + "options": [None, + "Number of unprocessed messages in the vnode message queues of Search on this node in the past minute", + "messages", "load", "riak.search.vnodeq_size", "line"], + "lines": [ + ["riak_search_vnodeq_mean", "mean", "absolute"], + ["riak_search_vnodeq_median", "median", "absolute"], + ["riak_search_vnodeq_95", "95", "absolute"], + ["riak_search_vnodeq_99", "99", "absolute"], + ["riak_search_vnodeq_100", "100", "absolute"], + ] + }, + "search.index_errors": { + "options": [None, "Number of document index errors encountered by Search", 
"errors", "load", + "riak.search.index", "line"], + "lines": [ + ["search_index_fail_count", "errors", "absolute"] + ] + }, + "core.pbc": { + "options": [None, "Protocol buffer connections by status", "connections", "load", + "riak.core.protobuf_connections", "line"], + "lines": [ + ["pbc_active", "active", "absolute"], + # ["pbc_connects", "established_pastmin", "absolute"] + ] + }, + "core.repairs": { + "options": [None, "Number of repair operations this node has coordinated", "repairs", "load", + "riak.core.repairs", "line"], + "lines": [ + ["read_repairs", "read", "absolute"] + ] + }, + "core.fsm_active": { + "options": [None, "Active finite state machines by kind", "fsms", "load", "riak.core.fsm_active", "line"], + "lines": [ + ["node_get_fsm_active", "get", "absolute"], + ["node_put_fsm_active", "put", "absolute"], + ["index_fsm_active", "secondary index", "absolute"], + ["list_fsm_active", "list keys", "absolute"] + ] + }, + "core.fsm_rejected": { + # Writing "Sidejob's" here seems to cause some weird issues: it results in this chart being rendered in + # its own context and additionally, moves the entire Riak graph all the way up to the top of the Netdata + # dashboard for some reason. 
class Service(UrlService):
    """
    riakkv job built on UrlService: fetches the configured URL and hands
    the parsed response to the charts declared in ORDER / CHARTS.
    """

    def __init__(self, configuration=None, name=None):
        """
        Initialise the underlying UrlService and attach this module's charts.

        :param configuration: job configuration, passed through to UrlService
        :param name: job name, passed through to UrlService
        """
        UrlService.__init__(self, configuration=configuration, name=name)
        self.order = ORDER
        self.definitions = CHARTS

    def _get_data(self):
        """
        Fetch the raw response and decode it with ``loads``.

        :return: the decoded document, or None when the response is empty
                 or decoding raises TypeError / ValueError (the error is
                 reported through ``self.error``)
        """
        payload = self._get_raw_data()
        if payload:
            try:
                stats = loads(payload)
            except (TypeError, ValueError) as err:
                # Report the decode failure and fall through to the None return.
                self.error(err)
            else:
                return stats
        return None
+# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# penalty indicates whether to apply penalty to update_every in case of failures. +# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes. +# penalty: yes + +# autodetection_retry sets the job re-check interval in seconds. +# The job is not deleted if check fails. +# Attempts to start the job are made once every autodetection_retry. +# This feature is disabled by default. +# autodetection_retry: 0 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# penalty: yes # the JOB's penalty +# autodetection_retry: 0 # the JOB's re-check interval in seconds +# +# +# ---------------------------------------------------------------------- +# AUTO-DETECTION JOBS +# only one of them will run (they have the same name) + +local: + url : 'http://localhost:8098/stats' -- cgit v1.2.3