summaryrefslogtreecommitdiffstats
path: root/conf.d
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--conf.d/Makefile.am7
-rw-r--r--conf.d/Makefile.in19
-rw-r--r--conf.d/apps_groups.conf2
-rw-r--r--conf.d/health.d/apache.conf9
-rw-r--r--conf.d/health.d/cpu.conf37
-rw-r--r--conf.d/health.d/disks.conf95
-rw-r--r--conf.d/health.d/entropy.conf15
-rw-r--r--conf.d/health.d/memcached.conf42
-rw-r--r--conf.d/health.d/mysql.conf13
-rw-r--r--conf.d/health.d/named.conf8
-rw-r--r--conf.d/health.d/net.conf51
-rw-r--r--conf.d/health.d/nginx.conf8
-rw-r--r--conf.d/health.d/qos.conf2
-rw-r--r--conf.d/health.d/ram.conf10
-rw-r--r--conf.d/health.d/redis.conf8
-rw-r--r--conf.d/health.d/retroshare.conf25
-rw-r--r--conf.d/health.d/softnet.conf21
-rw-r--r--conf.d/health.d/squid.conf8
-rw-r--r--conf.d/health.d/swap.conf22
-rw-r--r--conf.d/health.d/tcp_resets.conf32
-rw-r--r--conf.d/health_alarm_notify.conf222
-rw-r--r--conf.d/health_email_recipients.conf2
-rw-r--r--conf.d/python.d/mysql.conf4
-rw-r--r--conf.d/python.d/nginx.conf27
-rw-r--r--conf.d/python.d/phpfpm.conf7
-rw-r--r--conf.d/python.d/retroshare.conf67
26 files changed, 647 insertions, 116 deletions
diff --git a/conf.d/Makefile.am b/conf.d/Makefile.am
index 02fe86b0..066744ca 100644
--- a/conf.d/Makefile.am
+++ b/conf.d/Makefile.am
@@ -7,6 +7,8 @@ dist_config_DATA = \
apps_groups.conf \
charts.d.conf \
python.d.conf \
+ health_alarm_notify.conf \
+ health_email_recipients.conf \
$(NULL)
chartsconfigdir=$(configdir)/charts.d
@@ -34,6 +36,7 @@ dist_pythonconfig_DATA = \
python.d/phpfpm.conf \
python.d/postfix.conf \
python.d/redis.conf \
+ python.d/retroshare.conf \
python.d/sensors.conf \
python.d/squid.conf \
python.d/tomcat.conf \
@@ -45,13 +48,17 @@ dist_healthconfig_DATA = \
health.d/cpu.conf \
health.d/disks.conf \
health.d/entropy.conf \
+ health.d/tcp_resets.conf \
health.d/memcached.conf \
+ health.d/mysql.conf \
health.d/named.conf \
health.d/net.conf \
health.d/nginx.conf \
health.d/qos.conf \
health.d/ram.conf \
health.d/redis.conf \
+ health.d/retroshare.conf \
+ health.d/softnet.conf \
health.d/swap.conf \
health.d/squid.conf \
$(NULL)
diff --git a/conf.d/Makefile.in b/conf.d/Makefile.in
index 9356f60e..823713bf 100644
--- a/conf.d/Makefile.in
+++ b/conf.d/Makefile.in
@@ -84,8 +84,13 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
$(dist_healthconfig_DATA) $(dist_nodeconfig_DATA) \
$(dist_pythonconfig_DATA)
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_pthread.m4 \
- $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_c___atomic.m4 \
+ $(top_srcdir)/m4/ax_c__generic.m4 \
+ $(top_srcdir)/m4/ax_c_mallinfo.m4 \
+ $(top_srcdir)/m4/ax_c_mallopt.m4 \
+ $(top_srcdir)/m4/ax_check_compile_flag.m4 \
+ $(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/jemalloc.m4 \
+ $(top_srcdir)/m4/tcmalloc.m4 $(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
mkinstalldirs = $(install_sh) -d
@@ -212,6 +217,7 @@ PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
+SSE_CANDIDATE = @SSE_CANDIDATE@
STRIP = @STRIP@
UUID_CFLAGS = @UUID_CFLAGS@
UUID_LIBS = @UUID_LIBS@
@@ -244,6 +250,8 @@ datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
+has_jemalloc = @has_jemalloc@
+has_tcmalloc = @has_tcmalloc@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
@@ -288,6 +296,8 @@ dist_config_DATA = \
apps_groups.conf \
charts.d.conf \
python.d.conf \
+ health_alarm_notify.conf \
+ health_email_recipients.conf \
$(NULL)
chartsconfigdir = $(configdir)/charts.d
@@ -315,6 +325,7 @@ dist_pythonconfig_DATA = \
python.d/phpfpm.conf \
python.d/postfix.conf \
python.d/redis.conf \
+ python.d/retroshare.conf \
python.d/sensors.conf \
python.d/squid.conf \
python.d/tomcat.conf \
@@ -326,13 +337,17 @@ dist_healthconfig_DATA = \
health.d/cpu.conf \
health.d/disks.conf \
health.d/entropy.conf \
+ health.d/tcp_resets.conf \
health.d/memcached.conf \
+ health.d/mysql.conf \
health.d/named.conf \
health.d/net.conf \
health.d/nginx.conf \
health.d/qos.conf \
health.d/ram.conf \
health.d/redis.conf \
+ health.d/retroshare.conf \
+ health.d/softnet.conf \
health.d/swap.conf \
health.d/squid.conf \
$(NULL)
diff --git a/conf.d/apps_groups.conf b/conf.d/apps_groups.conf
index 0a6f55cd..57357a87 100644
--- a/conf.d/apps_groups.conf
+++ b/conf.d/apps_groups.conf
@@ -114,7 +114,7 @@ ha: corosync hs_logd ha_logd stonithd
pbx: asterisk safe_asterisk *vicidial*
sip: opensips* stund
murmur: murmurd
-vines: *vines*
+xmpp: *vines* *prosody*
# -----------------------------------------------------------------------------
# monitoring
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
index 1fddbc99..0aaf0e00 100644
--- a/conf.d/health.d/apache.conf
+++ b/conf.d/health.d/apache.conf
@@ -4,10 +4,11 @@
template: apache_last_collected_secs
on: apache.requests
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
-
+ to: webmaster
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 9332e508..4d79fc79 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -1,24 +1,33 @@
-template: 5min_cpu_pcent
+template: 10min_cpu_usage
on: system.cpu
- lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice
- every: 1m
- warn: $this > 90
+ lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
units: %
- info: average cpu utilization for the last 5 minutes
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cpu utilization for the last 10 minutes
+ to: sysadmin
-template: 5min_iowait_cpu_pcent
+template: 10min_cpu_iowait
on: system.cpu
- lookup: average -5m unaligned of iowait
- every: 1m
- warn: $this > 10
+ lookup: average -10m unaligned of iowait
units: %
- info: average wait I/O for the last 5 minutes
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (20) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average CPU wait I/O for the last 10 minutes
+ to: sysadmin
-template: 20min_steal_cpu_pcent
+template: 20min_steal_cpu
on: system.cpu
lookup: average -20m unaligned of steal
- every: 5m
- warn: $this > 10
units: %
- info: average stolen CPU time for the last 20 minutes
+ every: 5m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (20) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average CPU steal time for the last 20 minutes
+ to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index c38f1a0a..cc7a4766 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -1,18 +1,59 @@
# -----------------------------------------------------------------------------
+# make sure we collect values for each disk
+
+# for mount points
+template: disk_space_last_collected_secs
+ on: disk.space
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection of the mount point
+ to: sysadmin
+
+# for block devices
+template: disk_last_collected_secs
+ on: disk.io
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection of the block device
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
# low disk space
# checking the latest collected values
# raise an alarm if the disk is low on
# available disk space
-template: disk_full_percent
+template: disk_space_usage
on: disk.space
calc: $used * 100 / ($avail + $used)
- every: 1m
- warn: $this > 80
- crit: $this > 95
units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: up 1m down 15m multiplier 1.5 max 1h
info: current disk space usage
+ to: sysadmin
+
+template: disk_inode_usage
+ on: disk.inodes
+ calc: $used * 100 / ($avail + $used)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (80))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: current disk inode usage
+ to: sysadmin
# -----------------------------------------------------------------------------
@@ -20,7 +61,7 @@ template: disk_full_percent
# calculate the rate the disk fills
# use as base, the available space change
-# during the last 30 minutes
+# during the last hour
# this is just a calculation - it has no alarm
# we will use it in the next template to find
@@ -28,25 +69,27 @@ template: disk_full_percent
template: disk_fill_rate
on: disk.space
- lookup: max -1s at -30m unaligned of avail
- calc: ($this - $avail) / ($now - $after)
- every: 15s
- units: MB/s
- info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: GB/hour
+ info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
# calculate the hours remaining
# if the disk continues to fill
# in this rate
-template: disk_full_after_hours
+template: out_of_disk_space_time
on: disk.space
- calc: $avail / $disk_fill_rate / 3600
- every: 10s
- warn: $this > 0 and $this < 48
- crit: $this > 0 and $this < 24
+ calc: $avail / $disk_fill_rate
units: hours
- info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+ to: sysadmin
# -----------------------------------------------------------------------------
@@ -59,13 +102,15 @@ template: disk_full_after_hours
template: 10min_disk_utilization
on: disk.util
lookup: average -10m unaligned
+ units: %
every: 1m
green: 90
red: 98
- warn: $this > $green
- crit: $this > $red
- units: %
+ warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
+ crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
info: the percentage of time the disk was busy, during the last 10 minutes
+ to: sysadmin
# raise an alarm if the disk backlog
@@ -76,10 +121,12 @@ template: 10min_disk_utilization
template: 10min_disk_backlog
on: disk.backlog
lookup: average -10m unaligned
- every: 1m
- green: 1000
- red: 2000
- warn: $this > $green
- crit: $this > $red
units: ms
+ every: 1m
+ green: 2000
+ red: 5000
+ warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
+ crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
info: average of the kernel estimated disk backlog, for the last 10 minutes
+ to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index 6f8b6e85..d0eca8a6 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -1,13 +1,14 @@
# check if entropy is too low
# the alarm is checked every 1 minute
-# and examines the last 30 minutes of data
+# and examines the last hour of data
- alarm: min_30min_entropy
+ alarm: 1hour_lowest_entropy
on: system.entropy
- lookup: min -30m unaligned
- every: 1m
- warn: $this < 200
- crit: $this < 100
+ lookup: min -1h unaligned
units: entries
- info: minimum entries in the random numbers pool (entropy), for the last 30 minutes
+ every: 5m
+ warn: $this < (($status >= $WARNING) ? (200) : (100))
+ delay: down 1h multiplier 1.5 max 1h
+ info: minimum entries in the random numbers pool in the last hour
+ to: silent
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
index 05ff1471..46a8ca0e 100644
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -4,43 +4,49 @@
template: memcached_last_collected_secs
on: memcached.cache
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: dba
# detect if memcached cache is full
-template: cache_full_pcent
+template: memcached_cache_memory_usage
on: memcached.cache
calc: $used * 100 / ($used + $available)
- every: 10s
- warn: $this > 80
- crit: $this > 90
units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: up 0 down 15m multiplier 1.5 max 1h
info: current cache memory usage
+ to: dba
# find the rate memcached cache is filling
template: cache_fill_rate
on: memcached.cache
- lookup: max -1s at -30m unaligned of available
- calc: ($this - $available) / ($now - $after)
- every: 15s
- units: KB/s
- info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes
+ lookup: min -10m at -50m unaligned of available
+ calc: ($this - $available) / (($now - $after) / 3600)
+ units: KB/hour
+ every: 1m
+ info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
# find the hours remaining until memcached cache is full
-template: cache_full_after_hours
+template: out_of_cache_space_time
on: memcached.cache
- calc: $available / $cache_fill_rate / 3600
- every: 10s
- warn: $this > 0 and $this < 48
- crit: $this > 0 and $this < 24
+ calc: $available / $cache_fill_rate
units: hours
- info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+ to: dba
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
new file mode 100644
index 00000000..a2cfa3ec
--- /dev/null
+++ b/conf.d/health.d/mysql.conf
@@ -0,0 +1,13 @@
+
+# make sure mysql is running
+
+template: mysql_last_collected_secs
+ on: mysql.queries
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
index e46d1d33..f2eaa83c 100644
--- a/conf.d/health.d/named.conf
+++ b/conf.d/health.d/named.conf
@@ -4,9 +4,11 @@
template: named_last_collected_secs
on: named.global_queries
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: domainadmin
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index f65bc4fc..7753aa18 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -1,27 +1,48 @@
+# -----------------------------------------------------------------------------
+# make sure we collect values for each interface
+
+template: interface_last_collected_secs
+ on: net.net
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
# check if an interface is dropping packets
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
-template: 30min_packet_drops
+template: 1hour_packet_drops
on: net.drops
- lookup: sum -30m unaligned absolute
- every: 1m
- crit: $this > 0
+ lookup: sum -1h unaligned absolute
units: packets
- info: dropped packets in the last 30 minutes
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: interface dropped packets in the last hour
+ to: sysadmin
+# -----------------------------------------------------------------------------
+
# check if an interface is having FIFO
# buffer errors
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
-template: 30min_fifo_errors
+template: 1hour_fifo_errors
on: net.fifo
- lookup: sum -30m unaligned absolute
- every: 1m
- crit: $this > 0
+ lookup: sum -1h unaligned absolute
units: errors
- info: network interface fifo errors in the last 30 minutes
-
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: interface fifo errors in the last hour
+ to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
index da13008e..d70d6a59 100644
--- a/conf.d/health.d/nginx.conf
+++ b/conf.d/health.d/nginx.conf
@@ -4,9 +4,11 @@
template: nginx_last_collected_secs
on: nginx.requests
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: webmaster
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
index ac3bf8ff..9e5939fd 100644
--- a/conf.d/health.d/qos.conf
+++ b/conf.d/health.d/qos.conf
@@ -8,5 +8,7 @@
# lookup: sum -10m unaligned absolute
# every: 30s
# warn: $this > 0
+# delay: up 0 down 30m multiplier 1.5 max 1h
# units: packets
# info: dropped packets in the last 30 minutes
+# to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index 1d368112..216b82fe 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -1,9 +1,11 @@
- alarm: used_ram_pcent
+ alarm: ram_in_use
on: system.ram
calc: $used * 100 / ($used + $cached + $free)
- every: 10s
- warn: $this > 80
- crit: $this > 90
units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
info: system RAM usage
+ to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 3750176c..3e648d85 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -4,9 +4,11 @@
template: redis_last_collected_secs
on: redis.operations
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: dba
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
new file mode 100644
index 00000000..1af7b468
--- /dev/null
+++ b/conf.d/health.d/retroshare.conf
@@ -0,0 +1,25 @@
+# make sure RetroShare is running
+
+template: retroshare_last_collected_secs
+ on: retroshare.peers
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# make sure the DHT is fine when active
+
+template: retroshare_dht_working
+ on: retroshare.dht
+ calc: $dht_size_all
+ units: peers
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (120) : (100))
+ crit: $this < (($status == $CRITICAL) ? (10) : (1))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: Checks if the DHT has enough peers to operate
+ to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
new file mode 100644
index 00000000..0c3709f4
--- /dev/null
+++ b/conf.d/health.d/softnet.conf
@@ -0,0 +1,21 @@
+# check for common /proc/net/softnet_stat errors
+
+ alarm: 1hour_netdev_backlog_exceeded
+ on: system.softnet_stat
+ lookup: sum -1h unaligned absolute of dropped
+ units: packets
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ to: sysadmin
+
+ alarm: 1hour_netdev_budget_ran_outs
+ on: system.softnet_stat
+ lookup: sum -1h unaligned absolute of squeezed
+ units: events
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
+ to: silent
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
index cc5ce1c3..76143c5d 100644
--- a/conf.d/health.d/squid.conf
+++ b/conf.d/health.d/squid.conf
@@ -4,9 +4,11 @@
template: squid_last_collected_secs
on: squid.clients_requests
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: proxyadmin
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index 552dd310..0cfa888c 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -4,17 +4,21 @@
lookup: sum -30m unaligned absolute of out
# we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- every: 1m
- warn: $this > 1
- crit: $this > 10
units: % of RAM
- info: the sum of all memory swapped out during the last 30 minutes, as a percentage of the available RAM
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (15) : (20))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
+ to: sysadmin
- alarm: pcent_of_ram_in_swap
+ alarm: used_swap_space
on: system.swap
calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- every: 10s
- warn: $this > 10
- crit: $this > 50
units: % of RAM
- info: the currently used swap space, as a percentage of the available RAM
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (15) : (20))
+ crit: $this > (($status == $CRITICAL) ? (40) : (50))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: the swap memory used, as a percentage of the system RAM
+ to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
new file mode 100644
index 00000000..8e93c479
--- /dev/null
+++ b/conf.d/health.d/tcp_resets.conf
@@ -0,0 +1,32 @@
+# -----------------------------------------------------------------------------
+
+ alarm: ipv4_tcphandshake_last_collected_secs
+ on: ipv4.tcphandshake
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+
+ alarm: 1m_ipv4_tcp_resets
+ on: ipv4.tcphandshake
+ lookup: average -1m at -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ info: average TCP RESETS this host is sending, over the last minute
+
+ alarm: 10s_ipv4_tcp_resets
+ on: ipv4.tcphandshake
+ lookup: average -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING) ? (1) : (4)))
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
+ to: sysadmin
+
diff --git a/conf.d/health_alarm_notify.conf b/conf.d/health_alarm_notify.conf
new file mode 100644
index 00000000..d451cafe
--- /dev/null
+++ b/conf.d/health_alarm_notify.conf
@@ -0,0 +1,222 @@
+# Configuration for alarm notifications
+#
+# This configuration is used by: alarm-notify.sh
+# changes take effect immediately (the next alarm will use them).
+#
+# alarm-notify.sh can send:
+# - e-mails (using the sendmail command),
+# - push notifications to your mobile phone (pushover.net),
+# - messages to your slack team (slack.com),
+# - messages to your telegram chat / group chat (telegram.org)
+#
+# The 'to' line given at netdata alarms defines a *role*, so that many
+# people can be notified for each role.
+#
+# This file is a BASH script itself.
+#
+#
+###############################################################################
+# proxy configuration
+#
+# If you need to send curl based notifications (pushover, slack, telegram)
+# via a proxy, set these to your proxy address:
+#export http_proxy="http://10.0.0.1:3128/"
+#export https_proxy="http://10.0.0.1:3128/"
+
+
+###############################################################################
+# notifications images
+#
+# Images in notifications need to be downloaded from an Internet facing site.
+# To allow notification providers to fetch the icons/images, by default we set
+# the URL of the global public netdata registry.
+# If you have an Internet facing netdata (or you have copied the images/ folder
+# of netdata to your web server), set its URL here, to fetch the notification
+# images from it.
+#images_base_url="http://my.public.netdata.server:19999"
+
+
+###############################################################################
+# external commands
+
+# The full path to the sendmail command.
+# If empty, the system $PATH will be searched for it.
+# If not found, email notifications will be disabled (silently).
+sendmail=""
+
+# The full path of the curl command.
+# If empty, the system $PATH will be searched for it.
+# If not found, pushover, telegram and slack notifications will be
+# silently disabled.
+curl=""
+
+
+###############################################################################
+# NOTE ABOUT RECIPIENTS
+#
+# When you define recipients (all types):
+#
+# - email addresses
+# - pushover user tokens
+# - telegram chat ids
+# - slack channels
+#
+# You can append |critical to limit the notifications to be sent.
+#
+# In these examples, the first recipient receives all the alarms
+# while the second one receives only the critical ones:
+#
+# email : "user1@example.com user2@example.com|critical"
+# pushover: "2987343...9437837 8756278...2362736|critical"
+# telegram: "111827421 112746832|critical"
+# slack : "alarms disasters|critical"
+#
+# If a recipient is set to empty string, the default recipient of the given
+# notification method (email, pushover, telegram, slack) will be used.
+# To disable a notification, use the recipient called: disabled
+# This works for all notification methods (including the default recipients).
+
+
+###############################################################################
+# email global notification options
+
+# multiple recipients can be given like this:
+# "admin1@example.com admin2@example.com ..."
+
+# enable/disable sending emails
+SEND_EMAIL="YES"
+
+# if a role recipient is not configured, an email will be sent to:
+DEFAULT_RECIPIENT_EMAIL="root"
+# to receive only critical alarms, set it to "root|critical"
+
+
+###############################################################################
+# pushover (pushover.net) global notification options
+
+# multiple recipients can be given like this:
+# "USERTOKEN1 USERTOKEN2 ..."
+
+# enable/disable sending pushover notifications
+SEND_PUSHOVER="YES"
+
+# Login to pushover.net to get your pushover app token.
+# You need only one for all your netdata servers (or you can have one for
+# each of your netdata - your call).
+# Without an app token, netdata cannot send pushover notifications.
+PUSHOVER_APP_TOKEN=""
+
+# if a role's recipients are not configured, a notification will be sent to
+# this pushover user token (empty = do not send a notification for unconfigured
+# roles):
+DEFAULT_RECIPIENT_PUSHOVER=""
+
+
+###############################################################################
+# telegram (telegram.org) global notification options
+
+# To get your chat ID send the command /my_id to telegram bot @get_id.
+# Users also need to open a query with the bot (see below).
+
+# note: multiple recipients can be given like this:
+# "CHAT_ID_1 CHAT_ID_2 ..."
+
+# enable/disable sending telegram messages
+SEND_TELEGRAM="YES"
+
+# Contact the bot @BotFather to create a new bot and receive a bot token.
+# Without it, netdata cannot send telegram messages.
+TELEGRAM_BOT_TOKEN=""
+
+# If a role's recipients are not configured, a message will be sent to
+# this chat id (empty = do not send a notification for unconfigured roles):
+DEFAULT_RECIPIENT_TELEGRAM=""
+
+
+###############################################################################
+# slack (slack.com) global notification options
+
+# multiple recipients can be given like this:
+# "CHANNEL1 CHANNEL2 ..."
+
+# enable/disable sending slack notifications
+SEND_SLACK="YES"
+
+# Login to slack.com and create an incoming webhook. You need only one for all
+# your netdata servers (or you can have one for each of your netdata).
+# Without it, netdata cannot send slack notifications.
+# Get yours from: https://api.slack.com/incoming-webhooks
+SLACK_WEBHOOK_URL=""
+
+# if a role's recipients are not configured, a notification will be sent to
+# this slack channel (empty = do not send a notification for unconfigured
+# roles):
+DEFAULT_RECIPIENT_SLACK=""
+
+
+###############################################################################
+# RECIPIENTS PER ROLE
+
+# -----------------------------------------------------------------------------
+# generic system alarms
+# CPU, disks, network interfaces, entropy, etc
+
+role_recipients_email[sysadmin]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[sysadmin]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[sysadmin]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[sysadmin]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# DNS related alarms
+
+role_recipients_email[domainadmin]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[domainadmin]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[domainadmin]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[domainadmin]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# database servers alarms
+# mysql, redis, memcached, etc
+
+role_recipients_email[dba]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[dba]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[dba]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[dba]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# web servers alarms
+# apache, nginx, etc
+
+role_recipients_email[webmaster]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[webmaster]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[webmaster]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[webmaster]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# proxy servers alarms
+# squid, etc
+
+role_recipients_email[proxyadmin]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[proxyadmin]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[proxyadmin]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[proxyadmin]="${DEFAULT_RECIPIENT_SLACK}"
+
diff --git a/conf.d/health_email_recipients.conf b/conf.d/health_email_recipients.conf
new file mode 100644
index 00000000..f56c6c64
--- /dev/null
+++ b/conf.d/health_email_recipients.conf
@@ -0,0 +1,2 @@
+# OBSOLETE FILE
+# REPLACED WITH health_alarm_notify.conf
diff --git a/conf.d/python.d/mysql.conf b/conf.d/python.d/mysql.conf
index d247b89a..8fbbe651 100644
--- a/conf.d/python.d/mysql.conf
+++ b/conf.d/python.d/mysql.conf
@@ -104,6 +104,8 @@ tcp:
# pass : ''
host : 'localhost'
port : '3306'
+ # keep in mind port might be ignored by mysql, if host = 'localhost'
+ # http://serverfault.com/questions/337818/how-to-force-mysql-to-connect-by-tcp-instead-of-a-unix-socket/337844#337844
tcpipv4:
name : 'local'
@@ -158,6 +160,8 @@ tcp_root:
# pass : ''
host : 'localhost'
port : '3306'
+ # keep in mind port might be ignored by mysql, if host = 'localhost'
+ # http://serverfault.com/questions/337818/how-to-force-mysql-to-connect-by-tcp-instead-of-a-unix-socket/337844#337844
tcpipv4_root:
name : 'local'
diff --git a/conf.d/python.d/nginx.conf b/conf.d/python.d/nginx.conf
index 1a27d67c..645925a5 100644
--- a/conf.d/python.d/nginx.conf
+++ b/conf.d/python.d/nginx.conf
@@ -1,5 +1,17 @@
# netdata python.d.plugin configuration for nginx
#
+# You must have ngx_http_stub_status_module configured on your nginx server for this
+# plugin to work. The following is an example config.
+# It must be located inside a server { } block.
+#
+# location /stub_status {
+# stub_status;
+# # Security: Only allow access from the IP below.
+# allow 192.168.1.200;
+# # Deny anyone else
+# deny all;
+# }
+#
# This file is in YaML format. Generally the format is:
#
# name: value
@@ -47,14 +59,15 @@
# predefined parameters. These are:
#
# job_name:
-# name: myname # the JOB's name as it will appear at the
-# # dashboard (by default is the job_name)
+# name: my_name # the JOB's name as it will appear at the
+# # dashboard. If name: is not supplied the
+# # job_name: will be used (use _ for spaces)
# # JOBs sharing a name are mutually exclusive
# update_every: 1 # the JOB's data collection frequency
# priority: 60000 # the JOB's order on the dashboard
# retries: 5 # the JOB's number of restoration attempts
#
-# Additionally to the above, nginx also supports the following:
+# Additionally to the above, this plugin also supports the following:
#
# url: 'URL' # the URL to fetch nginx's status stats
#
@@ -63,6 +76,14 @@
# user: 'username'
# pass: 'password'
#
+# Example
+#
+# RemoteNginx:
+# name : 'Reverse_Proxy'
+# url : 'http://yourdomain.com/stub_status'
+#
+# "RemoteNginx" will show up in Netdata logs. "Reverse Proxy" will show up in the menu
+# in the nginx section.
# ----------------------------------------------------------------------
# AUTO-DETECTION JOBS
diff --git a/conf.d/python.d/phpfpm.conf b/conf.d/python.d/phpfpm.conf
index 06d2367a..f5d067cc 100644
--- a/conf.d/python.d/phpfpm.conf
+++ b/conf.d/python.d/phpfpm.conf
@@ -57,6 +57,7 @@
# Additionally to the above, PHP-FPM also supports the following:
#
# url: 'URL' # the URL to fetch nginx's status stats
+# # Be sure to include ?full&json at the end of the url
#
# if the URL is password protected, the following are supported:
#
@@ -70,13 +71,13 @@
localhost:
name : 'local'
- url : "http://localhost/status"
+ url : "http://localhost/status?full&json"
localipv4:
name : 'local'
- url : "http://127.0.0.1/status"
+ url : "http://127.0.0.1/status?full&json"
localipv6:
name : 'local'
- url : "http://::1/status"
+ url : "http://::1/status?full&json"
diff --git a/conf.d/python.d/retroshare.conf b/conf.d/python.d/retroshare.conf
new file mode 100644
index 00000000..79614373
--- /dev/null
+++ b/conf.d/python.d/retroshare.conf
@@ -0,0 +1,67 @@
+# netdata python.d.plugin configuration for RetroShare
+#
+# This file is in YAML format. Generally the format is:
+#
+# name: value
+#
+# There are 2 sections:
+# - global variables
+# - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 1
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# retries sets the number of retries to be made in case of failures.
+# If unset, the default for python.d.plugin is used.
+# Attempts to restore the service are made once every update_every
+# and only if the module has collected values in the past.
+# retries: 5
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed to run at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+# name: myname # the JOB's name as it will appear at the
+# # dashboard (by default is the job_name)
+# # JOBs sharing a name are mutually exclusive
+# update_every: 1 # the JOB's data collection frequency
+# priority: 60000 # the JOB's order on the dashboard
+# retries: 5 # the JOB's number of restoration attempts
+#
+# Additionally to the above, RetroShare also supports the following:
+#
+# - url: 'url' # the URL to the WebUI
+#
+# ----------------------------------------------------------------------
+# AUTO-DETECTION JOBS
+# only one of them will run (they have the same name)
+
+localhost:
+ name: 'local'
+ url: 'http://localhost:9090'