Diffstat (limited to 'collectors/python.d.plugin')
18 files changed, 1046 insertions, 120 deletions
diff --git a/collectors/python.d.plugin/Makefile.am b/collectors/python.d.plugin/Makefile.am index 652a35da4..ad72cfaef 100644 --- a/collectors/python.d.plugin/Makefile.am +++ b/collectors/python.d.plugin/Makefile.am @@ -87,6 +87,7 @@ include rabbitmq/Makefile.inc include redis/Makefile.inc include rethinkdbs/Makefile.inc include retroshare/Makefile.inc +include riakkv/Makefile.inc include samba/Makefile.inc include sensors/Makefile.inc include smartd_log/Makefile.inc diff --git a/collectors/python.d.plugin/README.md b/collectors/python.d.plugin/README.md index 8955197a7..32437c6db 100644 --- a/collectors/python.d.plugin/README.md +++ b/collectors/python.d.plugin/README.md @@ -150,7 +150,7 @@ Classes implement `_get_raw_data` which should be used to grab raw data. This me _This is last resort class, if a new module cannot be written by using other framework class this one can be used._ -_Example: `mysql`, `sensors`_ +_Example: `ceph`, `sensors`_ It is the lowest-level class which implements most of module logic, like: - threading diff --git a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py index 052c93144..3fcb5fda8 100644 --- a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py +++ b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py @@ -56,8 +56,8 @@ GOOD_PD_STATUS = ( ) RE_LD = re.compile( - r'Logical device number\s+([0-9]+).*?' - r'Status of logical device\s+: ([a-zA-Z]+)' + r'Logical [dD]evice number\s+([0-9]+).*?' + r'Status of [lL]ogical [dD]evice\s+: ([a-zA-Z]+)' ) diff --git a/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py b/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py index 47a7d23f6..7fe860314 100644 --- a/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py +++ b/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py @@ -8,11 +8,6 @@ from socket import getaddrinfo, gaierror from threading import Thread try: - from time import monotonic as time -except ImportError: - from time import time - -try: import dns.message import dns.query import dns.name @@ -89,13 +84,15 @@ def dns_request(server_list, timeout, domains): request = dns.message.make_query(domain, dns.rdatatype.A) try: - dns_start = time() - dns.query.udp(request, ns, timeout=t) - dns_end = time() - query_time = round((dns_end - dns_start) * 1000) - q.put({'_'.join(['ns', ns.replace('.', '_')]): query_time}) + resp = dns.query.udp(request, ns, timeout=t) + if (resp.rcode() == dns.rcode.NOERROR and resp.answer): + query_time = resp.time * 1000 + else: + query_time = -100 except dns.exception.Timeout: - q.put({'_'.join(['ns', ns.replace('.', '_')]): -100}) + query_time = -100 + finally: + q.put({'_'.join(['ns', ns.replace('.', '_')]): query_time}) for server in server_list: th = Thread(target=dns_req, args=(server, timeout, que)) diff --git a/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py b/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py index 9b3c1284d..20109c64f 100644 --- a/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py +++ b/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py @@ -10,9 +10,9 @@ from collections import namedtuple from socket import gethostbyname, gaierror try: - from queue import Queue + from queue import Queue except ImportError: - from Queue import Queue + from Queue import Queue from bases.FrameworkServices.UrlService import UrlService @@ -83,11 +83,11 @@ NODE_STATS = [ ] 
CLUSTER_STATS = [ - 'nodes.count.data_only', - 'nodes.count.master_data', + 'nodes.count.data', + 'nodes.count.master', 'nodes.count.total', - 'nodes.count.master_only', - 'nodes.count.client', + 'nodes.count.coordinating_only', + 'nodes.count.ingest', 'indices.docs.count', 'indices.query_cache.hit_count', 'indices.query_cache.miss_count', @@ -371,7 +371,7 @@ CHARTS = { }, 'cluster_health_nodes': { 'options': [None, 'Nodes Statistics', 'nodes', 'cluster health API', - 'elastic.cluster_health_nodes', 'stacked'], + 'elastic.cluster_health_nodes', 'area'], 'lines': [ ['number_of_nodes', 'nodes', 'absolute'], ['number_of_data_nodes', 'data_nodes', 'absolute'], @@ -417,13 +417,13 @@ CHARTS = { }, 'cluster_stats_nodes': { 'options': [None, 'Nodes Statistics', 'nodes', 'cluster stats API', - 'elastic.cluster_nodes', 'stacked'], + 'elastic.cluster_nodes', 'area'], 'lines': [ - ['nodes_count_data_only', 'data_only', 'absolute'], - ['nodes_count_master_data', 'master_data', 'absolute'], + ['nodes_count_data', 'data', 'absolute'], + ['nodes_count_master', 'master', 'absolute'], ['nodes_count_total', 'total', 'absolute'], - ['nodes_count_master_only', 'master_only', 'absolute'], - ['nodes_count_client', 'client', 'absolute'] + ['nodes_count_ingest', 'ingest', 'absolute'], + ['nodes_count_coordinating_only', 'coordinating_only', 'absolute'] ] }, 'cluster_stats_query_cache': { diff --git a/collectors/python.d.plugin/monit/monit.chart.py b/collectors/python.d.plugin/monit/monit.chart.py index 3ac0032c5..9f3270572 100644 --- a/collectors/python.d.plugin/monit/monit.chart.py +++ b/collectors/python.d.plugin/monit/monit.chart.py @@ -4,23 +4,49 @@ # SPDX-License-Identifier: GPL-3.0-or-later import xml.etree.ElementTree as ET + +from collections import namedtuple + from bases.FrameworkServices.UrlService import UrlService -# see enum State_Type from monit.h (https://bitbucket.org/tildeslash/monit/src/master/src/monit.h) -MONIT_SERVICE_NAMES = [ - 'Filesystem', - 'Directory', - 'File', - 'Process', - 'Host', - 'System', - 'Fifo', - 'Program', - 'Net', -] +MonitType = namedtuple('MonitType', ('index', 'name')) + +# see enum Service_Type from monit.h (https://bitbucket.org/tildeslash/monit/src/master/src/monit.h) +# typedef enum { +# Service_Filesystem = 0, +# Service_Directory, +# Service_File, +# Service_Process, +# Service_Host, +# Service_System, +# Service_Fifo, +# Service_Program, +# Service_Net, +# Service_Last = Service_Net +# } __attribute__((__packed__)) Service_Type; -DEFAULT_SERVICES_IDS = [0, 1, 2, 3, 4, 6, 7, 8] +TYPE_FILESYSTEM = MonitType(0, 'filesystem') +TYPE_DIRECTORY = MonitType(1, 'directory') +TYPE_FILE = MonitType(2, 'file') +TYPE_PROCESS = MonitType(3, 'process') +TYPE_HOST = MonitType(4, 'host') +TYPE_SYSTEM = MonitType(5, 'system') +TYPE_FIFO = MonitType(6, 'fifo') +TYPE_PROGRAM = MonitType(7, 'program') +TYPE_NET = MonitType(8, 'net') + +TYPES = ( + TYPE_FILESYSTEM, + TYPE_DIRECTORY, + TYPE_FILE, + TYPE_PROCESS, + TYPE_HOST, + TYPE_SYSTEM, + TYPE_FIFO, + TYPE_PROGRAM, + TYPE_NET, +) # charts order (can be overridden if you want less charts, or different order) ORDER = [ @@ -38,6 +64,7 @@ ORDER = [ 'program', 'net' ] + CHARTS = { 'filesystem': { 'options': ['filesystems', 'Filesystems', 'filesystems', 'filesystem', 'monit.filesystems', 'line'], @@ -83,7 +110,7 @@ CHARTS = { 'lines': [] }, 'host_latency': { - 'options': ['hosts latency', 'Hosts latency', 'milliseconds/s', 'network', 'monit.host_latency', 'line'], + 'options': ['hosts latency', 'Hosts latency', 'milliseconds', 'network', 
'monit.host_latency', 'line'], 'lines': [] }, 'net': { @@ -94,85 +121,224 @@ CHARTS = { } +class BaseMonitService(object): + def __init__(self, typ, name, status, monitor): + self.type = typ + self.name = name + self.status = status + self.monitor = monitor + + def __repr__(self): + return 'MonitService({0}:{1})'.format(self.type.name, self.name) + + def __eq__(self, other): + if not isinstance(other, BaseMonitService): + return False + return self.type == other.type and self.name == other.name + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(repr(self)) + + def is_running(self): + return self.status == '0' and self.monitor == '1' + + def key(self): + return '{0}_{1}'.format(self.type.name, self.name) + + def data(self): + return {self.key(): int(self.is_running())} + + +class ProcessMonitService(BaseMonitService): + def __init__(self, typ, name, status, monitor): + super(ProcessMonitService, self).__init__(typ, name, status, monitor) + self.uptime = None + self.threads = None + self.children = None + + def uptime_key(self): + return 'process_uptime_{0}'.format(self.name) + + def threads_key(self): + return 'process_threads_{0}'.format(self.name) + + def children_key(self): + return 'process_children_{0}'.format(self.name) + + def data(self): + base_data = super(ProcessMonitService, self).data() + # skipping bugged metrics with negative uptime (monit before v5.16) + uptime = self.uptime if self.uptime and int(self.uptime) >= 0 else None + data = { + self.uptime_key(): uptime, + self.threads_key(): self.threads, + self.children_key(): self.children, + } + data.update(base_data) + + return data + + +class HostMonitService(BaseMonitService): + def __init__(self, typ, name, status, monitor): + super(HostMonitService, self).__init__(typ, name, status, monitor) + self.latency = None + + def latency_key(self): + return 'host_latency_{0}'.format(self.name) + + def data(self): + base_data = super(HostMonitService, self).data() + latency = float(self.latency) * 1000000 if self.latency else None + data = {self.latency_key(): latency} + data.update(base_data) + + return data + + class Service(UrlService): def __init__(self, configuration=None, name=None): UrlService.__init__(self, configuration=configuration, name=name) self.order = ORDER self.definitions = CHARTS - base_url = self.configuration.get('url', 'http://localhost:2812') + base_url = self.configuration.get('url', "http://localhost:2812") self.url = '{0}/_status?format=xml&level=full'.format(base_url) + self.active_services = list() - def parse(self, data): + def parse(self, raw): try: - xml = ET.fromstring(data) + root = ET.fromstring(raw) except ET.ParseError: - self.error("URL {0} didn't return a vaild XML page. Please check your settings.".format(self.url)) + self.error("URL {0} didn't return a valid XML page. 
Please check your settings.".format(self.url)) + return None + return root + + def _get_data(self): + raw = self._get_raw_data() + if not raw: return None - return xml - def check(self): - self._manager = self._build_manager() + root = self.parse(raw) + if root is None: + return None - raw_data = self._get_raw_data() - if not raw_data: + services = self.get_services(root) + if not services: return None - return bool(self.parse(raw_data)) + if len(self.charts) > 0: + self.update_charts(services) - def _get_data(self): - raw_data = self._get_raw_data() + data = dict() - if not raw_data: - return None + for svc in services: + data.update(svc.data()) - xml = self.parse(raw_data) - if not xml: - return None + return data - data = {} - for service_id in DEFAULT_SERVICES_IDS: - service_category = MONIT_SERVICE_NAMES[service_id].lower() + def get_services(self, root): + services = list() - if service_category == 'system': - self.debug("Skipping service from 'System' category, because it's useless in graphs") + for typ in TYPES: + if typ == TYPE_SYSTEM: + self.debug("skipping service from '{0}' category, it's useless in graphs".format(TYPE_SYSTEM.name)) continue - xpath_query = "./service[@type='{0}']".format(service_id) - self.debug('Searching for {0} as {1}'.format(service_category, xpath_query)) - for service_node in xml.findall(xpath_query): - - service_name = service_node.find('name').text - service_status = service_node.find('status').text - service_monitoring = service_node.find('monitor').text - self.debug('=> found {0} with type={1}, status={2}, monitoring={3}'.format(service_name, - service_id, service_status, service_monitoring)) - - dimension_key = service_category + '_' + service_name - if dimension_key not in self.charts[service_category]: - self.charts[service_category].add_dimension([dimension_key, service_name, 'absolute']) - data[dimension_key] = 1 if service_status == '0' and service_monitoring == '1' else 0 - - if service_category == 'process': - for subnode in ('uptime', 'threads', 'children'): - subnode_value = service_node.find(subnode) - if subnode_value is None: - continue - if subnode == 'uptime' and int(subnode_value.text) < 0: - self.debug('Skipping bugged metrics with negative uptime (monit before v5.16') - continue - dimension_key = 'process_{0}_{1}'.format(subnode, service_name) - if dimension_key not in self.charts['process_' + subnode]: - self.charts['process_' + subnode].add_dimension([dimension_key, service_name, 'absolute']) - data[dimension_key] = int(subnode_value.text) - - if service_category == 'host': - subnode_value = service_node.find('./icmp/responsetime') - if subnode_value is None: - continue - dimension_key = 'host_latency_{0}'.format(service_name) - if dimension_key not in self.charts['host_latency']: - self.charts['host_latency'].add_dimension([dimension_key, service_name, - 'absolute', 1000, 1000000]) - data[dimension_key] = float(subnode_value.text) * 1000000 - - return data or None + xpath_query = "./service[@type='{0}']".format(typ.index) + self.debug('Searching for {0} as {1}'.format(typ.name, xpath_query)) + + for svc_root in root.findall(xpath_query): + svc = create_service(svc_root, typ) + self.debug('=> found {0} with type={1}, status={2}, monitoring={3}'.format( + svc.name, svc.type.name, svc.status, svc.monitor)) + + services.append(svc) + + return services + + def update_charts(self, services): + remove = [svc for svc in self.active_services if svc not in services] + add = [svc for svc in services if svc not in self.active_services] + + 
self.remove_services_from_charts(remove) + self.add_services_to_charts(add) + + self.active_services = services + + def add_services_to_charts(self, services): + for svc in services: + if svc.type == TYPE_HOST: + self.charts['host_latency'].add_dimension([svc.latency_key(), svc.name, 'absolute', 1000, 1000000]) + if svc.type == TYPE_PROCESS: + self.charts['process_uptime'].add_dimension([svc.uptime_key(), svc.name]) + self.charts['process_threads'].add_dimension([svc.threads_key(), svc.name]) + self.charts['process_children'].add_dimension([svc.children_key(), svc.name]) + self.charts[svc.type.name].add_dimension([svc.key(), svc.name]) + + def remove_services_from_charts(self, services): + for svc in services: + if svc.type == TYPE_HOST: + self.charts['host_latency'].del_dimension(svc.latency_key(), False) + if svc.type == TYPE_PROCESS: + self.charts['process_uptime'].del_dimension(svc.uptime_key(), False) + self.charts['process_threads'].del_dimension(svc.threads_key(), False) + self.charts['process_children'].del_dimension(svc.children_key(), False) + self.charts[svc.type.name].del_dimension(svc.key(), False) + + +def create_service(root, typ): + if typ == TYPE_HOST: + return create_host_service(root) + elif typ == TYPE_PROCESS: + return create_process_service(root) + return create_base_service(root, typ) + + +def create_host_service(root): + svc = HostMonitService( + TYPE_HOST, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) + + latency = root.find('./icmp/responsetime') + if latency is not None: + svc.latency = latency.text + + return svc + + +def create_process_service(root): + svc = ProcessMonitService( + TYPE_PROCESS, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) + + uptime = root.find('uptime') + if uptime is not None: + svc.uptime = uptime.text + + threads = root.find('threads') + if threads is not None: + svc.threads = threads.text + + children = root.find('children') + if children is not None: + svc.children = children.text + + return svc + + +def create_base_service(root, typ): + return BaseMonitService( + typ, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) diff --git a/collectors/python.d.plugin/mysql/README.md b/collectors/python.d.plugin/mysql/README.md index eba9d7a2e..f7028ab68 100644 --- a/collectors/python.d.plugin/mysql/README.md +++ b/collectors/python.d.plugin/mysql/README.md @@ -218,6 +218,24 @@ It will produce following charts (if data is available): 45. **Flow Control** in ms * paused +46. **Users CPU time** in percentage + * users + +**Per user statistics:** + +1. **Rows Operations** in operations/s + * read + * send + * updated + * inserted + * deleted + +2. **Commands** in commands/s + * select + * update + * other + + ### configuration You can provide, per server, the following: @@ -234,7 +252,7 @@ You can provide, per server, the following: - ca: the path name of the Certificate Authority (CA) certificate file. This option, if used, must specify the same certificate used by the server. - capath: the path name of the directory that contains trusted SSL CA certificate files. - cipher: the list of permitted ciphers for SSL encryption. 
- + Here is an example for 3 servers: ```yaml @@ -260,6 +278,8 @@ remote: If no configuration is given, module will attempt to connect to mysql server via unix socket at `/var/run/mysqld/mysqld.sock` without password and with username `root` +`userstats` graph works only if you enable such plugin in MariaDB server and set proper mysql priviliges (SUPER or PROCESS). For more detail please check [MariaDB User Statistics page](https://mariadb.com/kb/en/library/user-statistics/) + --- [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fmysql%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/collectors/python.d.plugin/mysql/mysql.chart.py b/collectors/python.d.plugin/mysql/mysql.chart.py index 139fac158..82bd90794 100644 --- a/collectors/python.d.plugin/mysql/mysql.chart.py +++ b/collectors/python.d.plugin/mysql/mysql.chart.py @@ -11,6 +11,7 @@ from bases.FrameworkServices.MySQLService import MySQLService QUERY_GLOBAL = 'SHOW GLOBAL STATUS;' QUERY_SLAVE = 'SHOW SLAVE STATUS;' QUERY_VARIABLES = 'SHOW GLOBAL VARIABLES LIKE \'max_connections\';' +QUERY_USER_STATISTICS = 'SHOW USER_STATISTICS;' GLOBAL_STATS = [ 'Bytes_received', @@ -90,6 +91,7 @@ GLOBAL_STATS = [ 'Innodb_buffer_pool_write_requests', 'Innodb_buffer_pool_reads', 'Innodb_buffer_pool_wait_free', + 'Innodb_deadlocks', 'Qcache_hits', 'Qcache_lowmem_prunes', 'Qcache_inserts', @@ -149,6 +151,18 @@ SLAVE_STATS = [ ('Slave_IO_Running', slave_running) ] +USER_STATISTICS = [ + 'Select_commands', + 'Update_commands', + 'Other_commands', + 'Cpu_time', + 'Rows_read', + 'Rows_sent', + 'Rows_deleted', + 'Rows_inserted', + 'Rows_updated' +] + VARIABLES = [ 'max_connections' ] @@ -178,6 +192,7 @@ ORDER = [ 'innodb_os_log_fsync_writes', 'innodb_os_log_io', 'innodb_cur_row_lock', + 'innodb_deadlocks', 'innodb_rows', 'innodb_buffer_pool_pages', 'innodb_buffer_pool_flush_pages_requests', @@ -200,7 +215,8 @@ ORDER = [ 'galera_bytes', 'galera_queue', 'galera_conflicts', - 'galera_flow_control' + 'galera_flow_control', + 'userstats_cpu' ] CHARTS = { @@ -382,6 +398,13 @@ CHARTS = { ['Innodb_row_lock_current_waits', 'current_waits', 'absolute'] ] }, + 'innodb_deadlocks': { + 'options': [None, 'InnoDB Deadlocks', 'operations/s', 'innodb', + 'mysql.innodb_deadlocks', 'area'], + 'lines': [ + ['Innodb_deadlocks', 'deadlocks', 'incremental'] + ] + }, 'innodb_rows': { 'options': [None, 'InnoDB Row Operations', 'operations/s', 'innodb', 'mysql.innodb_rows', 'area'], 'lines': [ @@ -570,10 +593,45 @@ CHARTS = { 'lines': [ ['wsrep_flow_control_paused_ns', 'paused', 'incremental', 1, 1000000], ] + }, + 'userstats_cpu': { + 'options': [None, 'Users CPU time', 'percentage', 'userstats', 'mysql.userstats_cpu', 'stacked'], + 'lines': [] } } +def userstats_chart_template(name): + order = [ + 'userstats_rows_{0}'.format(name), + 'userstats_commands_{0}'.format(name) + ] + family = 'userstats {0}'.format(name) + + charts = { + order[0]: { + 'options': [None, 'Rows Operations', 'operations/s', family, 'mysql.userstats_rows', 'stacked'], + 'lines': [ + ['userstats_{0}_Rows_read'.format(name), 'read', 'incremental'], + ['userstats_{0}_Rows_send'.format(name), 'send', 'incremental'], + ['userstats_{0}_Rows_updated'.format(name), 'updated', 'incremental'], + ['userstats_{0}_Rows_inserted'.format(name), 'inserted', 'incremental'], + ['userstats_{0}_Rows_deleted'.format(name), 
'deleted', 'incremental'] + ] + }, + order[1]: { + 'options': [None, 'Commands', 'commands/s', family, 'mysql.userstats_commands', 'stacked'], + 'lines': [ + ['userstats_{0}_Select_commands'.format(name), 'select', 'incremental'], + ['userstats_{0}_Update_commands'.format(name), 'update', 'incremental'], + ['userstats_{0}_Other_commands'.format(name), 'other', 'incremental'] + ] + } + } + + return order, charts + + class Service(MySQLService): def __init__(self, configuration=None, name=None): MySQLService.__init__(self, configuration=configuration, name=name) @@ -583,6 +641,7 @@ class Service(MySQLService): global_status=QUERY_GLOBAL, slave_status=QUERY_SLAVE, variables=QUERY_VARIABLES, + user_statistics=QUERY_USER_STATISTICS, ) def _get_data(self): @@ -612,6 +671,12 @@ class Service(MySQLService): else: self.queries.pop('slave_status') + if 'user_statistics' in raw_data: + if raw_data['user_statistics'][0]: + to_netdata.update(self.get_userstats(raw_data)) + else: + self.queries.pop('user_statistics') + if 'variables' in raw_data: variables = dict(raw_data['variables'][0]) for key in VARIABLES: @@ -619,3 +684,70 @@ class Service(MySQLService): to_netdata[key] = variables[key] return to_netdata or None + + # raw_data['user_statistics'] contains the following data structure: + # ( + # ( + # ('netdata', 42L, 0L, 1264L, 3.111252999999968, 2.968510299999994, 110267L, 19741424L, 0L, 0L, 1265L, 0L, + # 0L, 0L, 3L, 0L, 1301L, 0L, 0L, 7633L, 0L, 83L, 44L, 0L, 0L), + # ('root', 60L, 0L, 184L, 0.22856499999999966, 0.1601419999999998, 11605L, 1516513L, 0L, 9L, 220L, 0L, 2L, 1L, + # 6L, 4L,127L, 0L, 0L, 45L, 0L, 45L, 0L, 0L, 0L) + # ), + # ( + # ('User', 253, 9, 128, 128, 0, 0), + # ('Total_connections', 3, 2, 11, 11, 0, 0), + # ('Concurrent_connections', 3, 1, 11, 11, 0, 0), + # ('Connected_time', 3, 4, 11, 11, 0, 0), + # ('Busy_time', 5, 21, 21, 21, 31, 0), + # ('Cpu_time', 5, 18, 21, 21, 31, 0), + # ('Bytes_received', 8, 6, 21, 21, 0, 0), + # ('Bytes_sent', 8, 8, 21, 21, 0, 0), + # ('Binlog_bytes_written', 8, 1, 21, 21, 0, 0), + # ('Rows_read', 8, 1, 21, 21, 0, 0), + # ('Rows_sent', 8, 4, 21, 21, 0, 0), + # ('Rows_deleted', 8, 1, 21, 21, 0, 0), + # ('Rows_inserted', 8, 1, 21, 21, 0, 0), + # ('Rows_updated', 8, 1, 21, 21, 0, 0), + # ('Select_commands', 8, 1, 21, 21, 0, 0), + # ('Update_commands', 8, 1, 21, 21, 0, 0), + # ('Other_commands', 8, 4, 21, 21, 0, 0), + # ('Commit_transactions', 8, 1, 21, 21, 0, 0), + # ('Rollback_transactions', 8, 1, 21, 21, 0, 0), + # ('Denied_connections', 8, 4, 21, 21, 0, 0), + # ('Lost_connections', 8, 1, 21, 21, 0, 0), + # ('Access_denied', 8, 2, 21, 21, 0, 0), + # ('Empty_queries', 8, 2, 21, 21, 0, 0), + # ('Total_ssl_connections', 8, 1, 21, 21, 0, 0), + # ('Max_statement_time_exceeded', 8, 1, 21, 21, 0, 0)), + # ) + def get_userstats(self, raw_data): + data = dict() + userstats_vars = [e[0] for e in raw_data['user_statistics'][1]] + for i, _ in enumerate(raw_data['user_statistics'][0]): + user_name = raw_data['user_statistics'][0][i][0] + userstats = dict(zip(userstats_vars, raw_data['user_statistics'][0][i])) + + if len(self.charts) > 0: + if ('userstats_{0}_Cpu_time'.format(user_name)) not in self.charts['userstats_cpu']: + self.add_userstats_dimensions(user_name) + self.create_new_userstats_charts(user_name) + + for key in USER_STATISTICS: + if key in userstats: + data['userstats_{0}_{1}'.format(user_name, key)] = userstats[key] + + return data + + def add_userstats_dimensions(self, name): + 
self.charts['userstats_cpu'].add_dimension(['userstats_{0}_Cpu_time'.format(name), name, 'incremental', 100, 1]) + + def create_new_userstats_charts(self, tube): + order, charts = userstats_chart_template(tube) + + for chart_name in order: + params = [chart_name] + charts[chart_name]['options'] + dimensions = charts[chart_name]['lines'] + + new_chart = self.charts.add_chart(params) + for dimension in dimensions: + new_chart.add_dimension(dimension) diff --git a/collectors/python.d.plugin/python.d.conf b/collectors/python.d.plugin/python.d.conf index 63eecbba8..e2ee8eeec 100644 --- a/collectors/python.d.plugin/python.d.conf +++ b/collectors/python.d.plugin/python.d.conf @@ -41,7 +41,7 @@ chrony: no # dockerd: yes # dovecot: yes # elasticsearch: yes -# energi: yes +# energid: yes # this is just an example example: no @@ -88,6 +88,7 @@ nginx_log: no # redis: yes # rethinkdbs: yes # retroshare: yes +# riakkv: yes # samba: yes # sensors: yes # smartd_log: yes @@ -101,4 +102,4 @@ unbound: no # uwsgi: yes # varnish: yes # w1sensor: yes -# web_log: yes
\ No newline at end of file +# web_log: yes diff --git a/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py b/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py index 439456655..b6f75bd5c 100644 --- a/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py +++ b/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py @@ -6,6 +6,8 @@ import urllib3 +from distutils.version import StrictVersion as version + from bases.FrameworkServices.SimpleService import SimpleService try: @@ -14,9 +16,30 @@ except AttributeError: pass +# https://github.com/urllib3/urllib3/blob/master/CHANGES.rst#19-2014-07-04 +# New retry logic and urllib3.util.retry.Retry configuration object. (Issue https://github.com/urllib3/urllib3/pull/326) +URLLIB3_MIN_REQUIRED_VERSION = '1.9' +URLLIB3_VERSION = urllib3.__version__ +URLLIB3 = 'urllib3' + + +def version_check(): + if version(URLLIB3_VERSION) >= version(URLLIB3_MIN_REQUIRED_VERSION): + return + + err = '{0} version: {1}, minimum required version: {2}, please upgrade'.format( + URLLIB3, + URLLIB3_VERSION, + URLLIB3_MIN_REQUIRED_VERSION, + ) + raise Exception(err) + + class UrlService(SimpleService): def __init__(self, configuration=None, name=None): + version_check() SimpleService.__init__(self, configuration=configuration, name=name) + self.debug("{0} version: {1}".format(URLLIB3, URLLIB3_VERSION)) self.url = self.configuration.get('url') self.user = self.configuration.get('user') self.password = self.configuration.get('pass') diff --git a/collectors/python.d.plugin/riakkv/Makefile.inc b/collectors/python.d.plugin/riakkv/Makefile.inc new file mode 100644 index 000000000..87d29f82f --- /dev/null +++ b/collectors/python.d.plugin/riakkv/Makefile.inc @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_python_DATA += riakkv/riakkv.chart.py +dist_pythonconfig_DATA += riakkv/riakkv.conf + +# do not install these files, but include them in the distribution +dist_noinst_DATA += riakkv/README.md riakkv/Makefile.inc + diff --git a/collectors/python.d.plugin/riakkv/README.md b/collectors/python.d.plugin/riakkv/README.md new file mode 100644 index 000000000..0bcf22c5b --- /dev/null +++ b/collectors/python.d.plugin/riakkv/README.md @@ -0,0 +1,110 @@ +# riakkv + +Monitors one or more Riak KV servers. + +**Requirements:** + +* An accessible `/stats` endpoint. See [the Riak KV configuration reference] + documentation](https://docs.riak.com/riak/kv/2.2.3/configuring/reference/#client-interfaces) + for how to enable this. + +The following charts are included, which are mostly derived from the metrics +listed +[here](https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#riak-metrics-to-graph). + +1. **Throughput** in operations/s + * **KV operations** + * gets + * puts + + * **Data type updates** + * counters + * sets + * maps + + * **Search queries** + * queries + + * **Search documents** + * indexed + + * **Strong consistency operations** + * gets + * puts + +2. 
**Latency** in milliseconds + * **KV latency** of the past minute + * get (mean, median, 95th / 99th / 100th percentile) + * put (mean, median, 95th / 99th / 100th percentile) + + * **Data type latency** of the past minute + * counter_merge (mean, median, 95th / 99th / 100th percentile) + * set_merge (mean, median, 95th / 99th / 100th percentile) + * map_merge (mean, median, 95th / 99th / 100th percentile) + + * **Search latency** of the past minute + * query (median, min, max, 95th / 99th percentile) + * index (median, min, max, 95th / 99th percentile) + + * **Strong consistency latency** of the past minute + * get (mean, median, 95th / 99th / 100th percentile) + * put (mean, median, 95th / 99th / 100th percentile) + +3. **Erlang VM metrics** + * **System counters** + * processes + + * **Memory allocation** in MB + * processes.allocated + * processes.used + +4. **General load / health metrics** + * **Siblings encountered in KV operations** during the past minute + * get (mean, median, 95th / 99th / 100th percentile) + + * **Object size in KV operations** during the past minute in KB + * get (mean, median, 95th / 99th / 100th percentile) + + * **Message queue length** in unprocessed messages + * vnodeq_size (mean, median, 95th / 99th / 100th percentile) + + * **Index operations** encountered by Search + * errors + + * **Protocol buffer connections** + * active + + * **Repair operations coordinated by this node** + * read + + * **Active finite state machines by kind** + * get + * put + * secondary_index + * list_keys + + * **Rejected finite state machines** + * get + * put + + * **Number of writes to Search failed due to bad data format by reason** + * bad_entry + * extract_fail + + +### configuration + +The module needs to be passed the full URL to Riak's stats endpoint. +For example: + +```yaml +myriak: + url: http://myriak.example.com:8098/stats +``` + +With no explicit configuration given, the module will attempt to connect to +`http://localhost:8098/stats`. + +The default update frequency for the plugin is set to 2 seconds as Riak +internally updates the metrics every second. If we were to update the metrics +every second, the resulting graph would contain odd jitter. diff --git a/collectors/python.d.plugin/riakkv/riakkv.chart.py b/collectors/python.d.plugin/riakkv/riakkv.chart.py new file mode 100644 index 000000000..f81e177a5 --- /dev/null +++ b/collectors/python.d.plugin/riakkv/riakkv.chart.py @@ -0,0 +1,315 @@ +# -*- coding: utf-8 -*- +# Description: riak netdata python.d module +# +# See also: +# https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html + +from json import loads + +from bases.FrameworkServices.UrlService import UrlService + +# Riak updates the metrics at the /stats endpoint every 1 second. +# If we use `update_every = 1` here, that means we might get weird jitter in the graph, +# so the default is set to 2 seconds to prevent it. +update_every = 2 + +# charts order (can be overridden if you want less charts, or different order) +ORDER = [ + # Throughput metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#throughput-metrics + # Collected in totals. + "kv.node_operations", # K/V node operations. + "dt.vnode_updates", # Data type vnode updates. + "search.queries", # Search queries on the node. + "search.documents", # Documents indexed by Search. + "consistent.operations", # Consistent node operations. 
+ + # Latency metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#throughput-metrics + # Collected for the past minute in milliseconds, + # returned from riak in microseconds. + "kv.latency.get", # K/V GET FSM traversal latency. + "kv.latency.put", # K/V PUT FSM traversal latency. + "dt.latency.counter", # Update Counter Data type latency. + "dt.latency.set", # Update Set Data type latency. + "dt.latency.map", # Update Map Data type latency. + "search.latency.query", # Search query latency. + "search.latency.index", # Time it takes for search to index a new document. + "consistent.latency.get", # Strong consistent read latency. + "consistent.latency.put", # Strong consistent write latency. + + # Erlang resource usage metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#erlang-resource-usage-metrics + # Processes collected as a gauge, + # memory collected as Megabytes, returned as bytes from Riak. + "vm.processes", # Number of processes currently running in the Erlang VM. + "vm.memory.processes", # Total amount of memory allocated & used for Erlang processes. + + # General Riak Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-load-health-metrics + # The following are collected by Riak over the past minute: + "kv.siblings_encountered.get", # Siblings encountered during GET operations by this node. + "kv.objsize.get", # Object size encountered by this node. + "search.vnodeq_size", # Number of unprocessed messages in the vnode message queues (Search). + # The following are calculated in total, or as gauges: + "search.index_errors", # Errors of the search subsystem while indexing documents. + "core.pbc", # Number of currently active protocol buffer connections. + "core.repairs", # Total read repair operations coordinated by this node. + "core.fsm_active", # Active finite state machines by kind. + "core.fsm_rejected", # Rejected finite state machines by kind. + + # General Riak Search Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-search-load-health-metrics + # Reported as counters. + "search.errors", # Write and read errors of the Search subsystem. 
+] + +CHARTS = { + # Throughput metrics + "kv.node_operations": { + "options": [None, "Reads & writes coordinated by this node", "operations/s", "throughput", "riak.kv.throughput", "line"], + "lines": [ + ["node_gets_total", "gets", "incremental"], + ["node_puts_total", "puts", "incremental"] + ] + }, + "dt.vnode_updates": { + "options": [None, "Update operations coordinated by local vnodes by data type", "operations/s", "throughput", "riak.dt.vnode_updates", "line"], + "lines": [ + ["vnode_counter_update_total", "counters", "incremental"], + ["vnode_set_update_total", "sets", "incremental"], + ["vnode_map_update_total", "maps", "incremental"], + ] + }, + "search.queries": { + "options": [None, "Search queries on the node", "queries/s", "throughput", "riak.search", "line"], + "lines": [ + ["search_query_throughput_count", "queries", "incremental"] + ] + }, + "search.documents": { + "options": [None, "Documents indexed by search", "documents/s", "throughput", "riak.search.documents", "line"], + "lines": [ + ["search_index_throughput_count", "indexed", "incremental"] + ] + }, + "consistent.operations": { + "options": [None, "Consistent node operations", "operations/s", "throughput", "riak.consistent.operations", "line"], + "lines": [ + ["consistent_gets_total", "gets", "incremental"], + ["consistent_puts_total", "puts", "incremental"], + ] + }, + + # Latency metrics + "kv.latency.get": { + "options": [None, "Time between reception of a client GET request and subsequent response to client", "ms", "latency", "riak.kv.latency.get", "line"], + "lines": [ + ["node_get_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_get_fsm_time_median", "median", "absolute", 1, 1000], + ["node_get_fsm_time_95", "95", "absolute", 1, 1000], + ["node_get_fsm_time_99", "99", "absolute", 1, 1000], + ["node_get_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "kv.latency.put": { + "options": [None, "Time between reception of a client PUT request and subsequent response to client", "ms", "latency", "riak.kv.latency.put", "line"], + "lines": [ + ["node_put_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_put_fsm_time_median", "median", "absolute", 1, 1000], + ["node_put_fsm_time_95", "95", "absolute", 1, 1000], + ["node_put_fsm_time_99", "99", "absolute", 1, 1000], + ["node_put_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.counter": { + "options": [None, "Time it takes to perform an Update Counter operation", "ms", "latency", "riak.dt.latency.counter_merge", "line"], + "lines": [ + ["object_counter_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_counter_merge_time_median", "median", "absolute", 1, 1000], + ["object_counter_merge_time_95", "95", "absolute", 1, 1000], + ["object_counter_merge_time_99", "99", "absolute", 1, 1000], + ["object_counter_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.set": { + "options": [None, "Time it takes to perform an Update Set operation", "ms", "latency", "riak.dt.latency.set_merge", "line"], + "lines": [ + ["object_set_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_set_merge_time_median", "median", "absolute", 1, 1000], + ["object_set_merge_time_95", "95", "absolute", 1, 1000], + ["object_set_merge_time_99", "99", "absolute", 1, 1000], + ["object_set_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.map": { + "options": [None, "Time it takes to perform an Update Map operation", "ms", "latency", "riak.dt.latency.map_merge", "line"], + "lines": [ + ["object_map_merge_time_mean", "mean", 
"absolute", 1, 1000], + ["object_map_merge_time_median", "median", "absolute", 1, 1000], + ["object_map_merge_time_95", "95", "absolute", 1, 1000], + ["object_map_merge_time_99", "99", "absolute", 1, 1000], + ["object_map_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "search.latency.query": { + "options": [None, "Search query latency", "ms", "latency", "riak.search.latency.query", "line"], + "lines": [ + ["search_query_latency_median", "median", "absolute", 1, 1000], + ["search_query_latency_min", "min", "absolute", 1, 1000], + ["search_query_latency_95", "95", "absolute", 1, 1000], + ["search_query_latency_99", "99", "absolute", 1, 1000], + ["search_query_latency_999", "999", "absolute", 1, 1000], + ["search_query_latency_max", "max", "absolute", 1, 1000], + ] + }, + "search.latency.index": { + "options": [None, "Time it takes Search to index a new document", "ms", "latency", "riak.search.latency.index", "line"], + "lines": [ + ["search_index_latency_median", "median", "absolute", 1, 1000], + ["search_index_latency_min", "min", "absolute", 1, 1000], + ["search_index_latency_95", "95", "absolute", 1, 1000], + ["search_index_latency_99", "99", "absolute", 1, 1000], + ["search_index_latency_999", "999", "absolute", 1, 1000], + ["search_index_latency_max", "max", "absolute", 1, 1000], + ] + }, + + # Riak Strong Consistency metrics + "consistent.latency.get": { + "options": [None, "Strongly consistent read latency", "ms", "latency", "riak.consistent.latency.get", "line"], + "lines": [ + ["consistent_get_time_mean", "mean", "absolute", 1, 1000], + ["consistent_get_time_median", "median", "absolute", 1, 1000], + ["consistent_get_time_95", "95", "absolute", 1, 1000], + ["consistent_get_time_99", "99", "absolute", 1, 1000], + ["consistent_get_time_100", "100", "absolute", 1, 1000], + ] + }, + "consistent.latency.put": { + "options": [None, "Strongly consistent write latency", "ms", "latency", "riak.consistent.latency.put", "line"], + "lines": [ + ["consistent_put_time_mean", "mean", "absolute", 1, 1000], + ["consistent_put_time_median", "median", "absolute", 1, 1000], + ["consistent_put_time_95", "95", "absolute", 1, 1000], + ["consistent_put_time_99", "99", "absolute", 1, 1000], + ["consistent_put_time_100", "100", "absolute", 1, 1000], + ] + }, + + # BEAM metrics + "vm.processes": { + "options": [None, "Total processes running in the Erlang VM", "total", "vm", "riak.vm", "line"], + "lines": [ + ["sys_process_count", "processes", "absolute"], + ] + }, + "vm.memory.processes": { + "options": [None, "Memory allocated & used by Erlang processes", "MB", "vm", "riak.vm.memory.processes", "line"], + "lines": [ + ["memory_processes", "allocated", "absolute", 1, 1024 * 1024], + ["memory_processes_used", "used", "absolute", 1, 1024 * 1024] + ] + }, + + # General Riak Load/Health metrics + "kv.siblings_encountered.get": { + "options": [None, "Number of siblings encountered during GET operations by this node during the past minute", "siblings", "load", "riak.kv.siblings_encountered.get", "line"], + "lines": [ + ["node_get_fsm_siblings_mean", "mean", "absolute"], + ["node_get_fsm_siblings_median", "median", "absolute"], + ["node_get_fsm_siblings_95", "95", "absolute"], + ["node_get_fsm_siblings_99", "99", "absolute"], + ["node_get_fsm_siblings_100", "100", "absolute"], + ] + }, + "kv.objsize.get": { + "options": [None, "Object size encountered by this node during the past minute", "KB", "load", "riak.kv.objsize.get", "line"], + "lines": [ + ["node_get_fsm_objsize_mean", "mean", "absolute", 1, 
1024], + ["node_get_fsm_objsize_median", "median", "absolute", 1, 1024], + ["node_get_fsm_objsize_95", "95", "absolute", 1, 1024], + ["node_get_fsm_objsize_99", "99", "absolute", 1, 1024], + ["node_get_fsm_objsize_100", "100", "absolute", 1, 1024], + ] + }, + "search.vnodeq_size": { + "options": [None, "Number of unprocessed messages in the vnode message queues of Search on this node in the past minute", "messages", "load", "riak.search.vnodeq_size", "line"], + "lines": [ + ["riak_search_vnodeq_mean", "mean", "absolute"], + ["riak_search_vnodeq_median", "median", "absolute"], + ["riak_search_vnodeq_95", "95", "absolute"], + ["riak_search_vnodeq_99", "99", "absolute"], + ["riak_search_vnodeq_100", "100", "absolute"], + ] + }, + "search.index_errors": { + "options": [None, "Number of document index errors encountered by Search", "errors", "load", "riak.search.index", "line"], + "lines": [ + ["search_index_fail_count", "errors", "absolute"] + ] + }, + "core.pbc": { + "options": [None, "Protocol buffer connections by status", "connections", "load", "riak.core.protobuf_connections", "line"], + "lines": [ + ["pbc_active", "active", "absolute"], + # ["pbc_connects", "established_pastmin", "absolute"] + ] + }, + "core.repairs": { + "options": [None, "Number of repair operations this node has coordinated", "repairs", "load", "riak.core.repairs", "line"], + "lines": [ + ["read_repairs", "read", "absolute"] + ] + }, + "core.fsm_active": { + "options": [None, "Active finite state machines by kind", "fsms", "load", "riak.core.fsm_active", "line"], + "lines": [ + ["node_get_fsm_active", "get", "absolute"], + ["node_put_fsm_active", "put", "absolute"], + ["index_fsm_active", "secondary index", "absolute"], + ["list_fsm_active", "list keys", "absolute"] + ] + }, + "core.fsm_rejected": { + # Writing "Sidejob's" here seems to cause some weird issues: it results in this chart being rendered in + # its own context and additionally, moves the entire Riak graph all the way up to the top of the Netdata + # dashboard for some reason. + "options": [None, "Finite state machines being rejected by Sidejobs overload protection", "fsms", "load", "riak.core.fsm_rejected", "line"], + "lines": [ + ["node_get_fsm_rejected", "get", "absolute"], + ["node_put_fsm_rejected", "put", "absolute"] + ] + }, + + # General Riak Search Load / Health metrics + "search.errors": { + "options": [None, "Number of writes to Search failed due to bad data format by reason", "writes", "load", "riak.search.index", "line"], + "lines": [ + ["search_index_bad_entry_count", "bad_entry", "absolute"], + ["search_index_extract_fail_count", "extract_fail", "absolute"], + ] + } +} + + +class Service(UrlService): + def __init__(self, configuration=None, name=None): + UrlService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = CHARTS + + def _get_data(self): + """ + Format data received from http request + :return: dict + """ + raw = self._get_raw_data() + if not raw: + return None + + try: + return loads(raw) + except (TypeError, ValueError) as err: + self.error(err) + return None diff --git a/collectors/python.d.plugin/riakkv/riakkv.conf b/collectors/python.d.plugin/riakkv/riakkv.conf new file mode 100644 index 000000000..be01c48ac --- /dev/null +++ b/collectors/python.d.plugin/riakkv/riakkv.conf @@ -0,0 +1,68 @@ +# netdata python.d.plugin configuration for riak +# +# This file is in YaML format. 
Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. +# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. +# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# penalty indicates whether to apply penalty to update_every in case of failures. +# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes. +# penalty: yes + +# autodetection_retry sets the job re-check interval in seconds. +# The job is not deleted if check fails. +# Attempts to start the job are made once every autodetection_retry. +# This feature is disabled by default. +# autodetection_retry: 0 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# penalty: yes # the JOB's penalty +# autodetection_retry: 0 # the JOB's re-check interval in seconds +# +# +# ---------------------------------------------------------------------- +# AUTO-DETECTION JOBS +# only one of them will run (they have the same name) + +local: + url : 'http://localhost:8098/stats' diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md index 3b0816fb8..f6584be70 100644 --- a/collectors/python.d.plugin/smartd_log/README.md +++ b/collectors/python.d.plugin/smartd_log/README.md @@ -85,7 +85,11 @@ For this you need to set `smartd_opts` (or `SMARTD_ARGS`, check _smartd.service_ # dump smartd attrs info every 600 seconds smartd_opts="-A /var/log/smartd/ -i 600" ``` - +You may need to create the smartd directory before smartd will write to it: +``` +mkdir -p /var/log/smartd +``` +Otherwise, all the smartd `.csv` files may get written to `/var/lib/smartmontools` (default location). See also [https://linux.die.net/man/8/smartd](https://linux.die.net/man/8/smartd) for more info on the `-A --attributelog=PREFIX` command. `smartd` appends logs at every run. It's strongly recommended to use `logrotate` for smartd files. 
diff --git a/collectors/python.d.plugin/tomcat/tomcat.chart.py b/collectors/python.d.plugin/tomcat/tomcat.chart.py index 01578c56e..ab3003304 100644 --- a/collectors/python.d.plugin/tomcat/tomcat.chart.py +++ b/collectors/python.d.plugin/tomcat/tomcat.chart.py @@ -5,11 +5,17 @@ # SPDX-License-Identifier: GPL-3.0-or-later import xml.etree.ElementTree as ET +import re from bases.FrameworkServices.UrlService import UrlService MiB = 1 << 20 +# Regex fix for Tomcat single quote XML attributes +# affecting Tomcat < 8.5.24 & 9.0.2 running with Java > 9 +# cf. https://bz.apache.org/bugzilla/show_bug.cgi?id=61603 +single_quote_regex = re.compile(r"='([^']+)'([^']+)''") + ORDER = [ 'accesses', 'bandwidth', @@ -95,6 +101,32 @@ class Service(UrlService): self.definitions = CHARTS self.url = self.configuration.get('url', 'http://127.0.0.1:8080/manager/status?XML=true') self.connector_name = self.configuration.get('connector_name', None) + self.parse = self.xml_parse + + def xml_parse(self, data): + try: + return ET.fromstring(data) + except ET.ParseError: + self.debug('%s is not a valid XML page. Please add "?XML=true" to tomcat status page.' % self.url) + return None + + def xml_single_quote_fix_parse(self, data): + data = single_quote_regex.sub(r"='\g<1>\g<2>'", data) + return self.xml_parse(data) + + def check(self): + self._manager = self._build_manager() + + raw_data = self._get_raw_data() + if not raw_data: + return False + + if single_quote_regex.search(raw_data): + self.warning('Tomcat status page is returning invalid single quote XML, please consider upgrading ' + 'your Tomcat installation. See https://bz.apache.org/bugzilla/show_bug.cgi?id=61603') + self.parse = self.xml_single_quote_fix_parse + + return self.parse(raw_data) is not None def _get_data(self): """ @@ -104,11 +136,10 @@ class Service(UrlService): data = None raw_data = self._get_raw_data() if raw_data: - try: - xml = ET.fromstring(raw_data) - except ET.ParseError: - self.debug('%s is not a vaild XML page. Please add "?XML=true" to tomcat status page.' 
% self.url) + xml = self.parse(raw_data) + if xml is None: return None + data = {} jvm = xml.find('jvm') @@ -153,7 +184,7 @@ class Service(UrlService): data['metaspace_committed'] = pool.get('usageCommitted') data['metaspace_max'] = pool.get('usageMax') - if connector: + if connector is not None: thread_info = connector.find('threadInfo') data['currentThreadsBusy'] = thread_info.get('currentThreadsBusy') data['currentThreadCount'] = thread_info.get('currentThreadCount') diff --git a/collectors/python.d.plugin/varnish/varnish.chart.py b/collectors/python.d.plugin/varnish/varnish.chart.py index 70af50ccb..58745e24d 100644 --- a/collectors/python.d.plugin/varnish/varnish.chart.py +++ b/collectors/python.d.plugin/varnish/varnish.chart.py @@ -5,9 +5,8 @@ import re -from bases.collection import find_binary from bases.FrameworkServices.ExecutableService import ExecutableService - +from bases.collection import find_binary ORDER = [ 'session_connections', @@ -138,6 +137,18 @@ CHARTS = { VARNISHSTAT = 'varnishstat' +re_version = re.compile(r'varnish-(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)') + + +class VarnishVersion: + def __init__(self, major, minor, patch): + self.major = major + self.minor = minor + self.patch = patch + + def __str__(self): + return '{0}.{1}.{2}'.format(self.major, self.minor, self.patch) + class Parser: _backend_new = re.compile(r'VBE.([\d\w_.]+)\(.*?\).(beresp[\w_]+)\s+(\d+)') @@ -185,10 +196,32 @@ class Service(ExecutableService): self.error("can't locate '{0}' binary or binary is not executable by user netdata".format(VARNISHSTAT)) return False + command = [varnishstat, '-V'] + reply = self._get_raw_data(stderr=True, command=command) + if not reply: + self.error( + "no output from '{0}'. Is varnish running? Not enough privileges?".format(' '.join(self.command))) + return False + + ver = parse_varnish_version(reply) + if not ver: + self.error("failed to parse reply from '{0}', used regex :'{1}', reply : {2}".format( + ' '.join(command), + re_version.pattern, + reply, + )) + return False + if self.instance_name: - self.command = [varnishstat, '-1', '-n', self.instance_name, '-t', '1'] + self.command = [varnishstat, '-1', '-n', self.instance_name] else: - self.command = [varnishstat, '-1', '-t', '1'] + self.command = [varnishstat, '-1'] + + if ver.major > 4: + self.command.extend(['-t', '1']) + + self.info("varnish version: {0}, will use command: '{1}'".format(ver, ' '.join(self.command))) + return True def check(self): @@ -198,14 +231,14 @@ class Service(ExecutableService): # STDOUT is not empty reply = self._get_raw_data() if not reply: - self.error("No output from 'varnishstat'. Is it running? Not enough privileges?") + self.error("no output from '{0}'. Is it running? 
Not enough privileges?".format(' '.join(self.command))) return False self.parser.init(reply) # Output is parsable if not self.parser.re_default: - self.error('Cant parse the output...') + self.error('cant parse the output...') return False if self.parser.re_backend: @@ -260,3 +293,16 @@ class Service(ExecutableService): self.order.insert(0, chart_name) self.definitions.update(chart) + + +def parse_varnish_version(lines): + m = re_version.search(lines[0]) + if not m: + return None + + m = m.groupdict() + return VarnishVersion( + int(m['major']), + int(m['minor']), + int(m['patch']), + ) diff --git a/collectors/python.d.plugin/web_log/web_log.chart.py b/collectors/python.d.plugin/web_log/web_log.chart.py index 6d6a261c4..fa5a8bc3e 100644 --- a/collectors/python.d.plugin/web_log/web_log.chart.py +++ b/collectors/python.d.plugin/web_log/web_log.chart.py @@ -4,9 +4,8 @@ # SPDX-License-Identifier: GPL-3.0-or-later import bisect -import re import os - +import re from collections import namedtuple, defaultdict from copy import deepcopy @@ -660,7 +659,7 @@ class Web: r' (?P<bytes_sent>\d+)' r' (?P<resp_length>\d+)' r' (?P<resp_time>\d+\.\d+)' - r' (?P<resp_time_upstream>[\d.-]+) ') + r' (?P<resp_time_upstream>[\d.-]+)') nginx_ext_append = re.compile(r'(?P<address>[\da-f.:]+)' r' -.*?"(?P<request>[^"]*)"' |