Merge tag 'upstream/1.4.0+dfsg'

Upstream version 1.4.0+dfsg
author: Federico Ceratto <federico.ceratto@gmail.com> 2016-11-23 15:49:14 +0000
committer: Federico Ceratto <federico.ceratto@gmail.com> 2016-11-23 15:49:14 +0000
commit: 68141d9dac0c08e51d257feef16a79086dd8a2df (patch)
tree: f4a0f5d31ed2194b5991130754b297b9c8c076e6 /conf.d
parent: Release v. 1.3.0+dfsg-1 to Unstable (diff)
parent: New upstream version 1.4.0+dfsg (diff)
download: netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.tar.xz
netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.zip
26 files changed, 647 insertions, 116 deletions
diff --git a/conf.d/Makefile.am b/conf.d/Makefile.am
index 02fe86b01..066744cab 100644
--- a/conf.d/Makefile.am
+++ b/conf.d/Makefile.am
@@ -7,6 +7,8 @@ dist_config_DATA = \
 	apps_groups.conf \
 	charts.d.conf \
 	python.d.conf \
+	health_alarm_notify.conf \
+	health_email_recipients.conf \
 	$(NULL)
 
 chartsconfigdir=$(configdir)/charts.d
@@ -34,6 +36,7 @@ dist_pythonconfig_DATA = \
 	python.d/phpfpm.conf \
 	python.d/postfix.conf \
 	python.d/redis.conf \
+	python.d/retroshare.conf \
 	python.d/sensors.conf \
 	python.d/squid.conf \
 	python.d/tomcat.conf \
@@ -45,13 +48,17 @@ dist_healthconfig_DATA = \
 	health.d/cpu.conf \
 	health.d/disks.conf \
 	health.d/entropy.conf \
+	health.d/tcp_resets.conf \
 	health.d/memcached.conf \
+	health.d/mysql.conf \
 	health.d/named.conf \
 	health.d/net.conf \
 	health.d/nginx.conf \
 	health.d/qos.conf \
 	health.d/ram.conf \
 	health.d/redis.conf \
+	health.d/retroshare.conf \
+	health.d/softnet.conf \
 	health.d/swap.conf \
 	health.d/squid.conf \
 	$(NULL)
diff --git a/conf.d/Makefile.in b/conf.d/Makefile.in
index 9356f60e2..823713bfa 100644
--- a/conf.d/Makefile.in
+++ b/conf.d/Makefile.in
@@ -84,8 +84,13 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(dist_healthconfig_DATA) $(dist_nodeconfig_DATA) \
 	$(dist_pythonconfig_DATA)
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_pthread.m4 \
-	$(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_c___atomic.m4 \
+	$(top_srcdir)/m4/ax_c__generic.m4 \
+	$(top_srcdir)/m4/ax_c_mallinfo.m4 \
+	$(top_srcdir)/m4/ax_c_mallopt.m4 \
+	$(top_srcdir)/m4/ax_check_compile_flag.m4 \
+	$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/jemalloc.m4 \
+	$(top_srcdir)/m4/tcmalloc.m4 $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -212,6 +217,7 @@ PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
 PTHREAD_LIBS = @PTHREAD_LIBS@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
+SSE_CANDIDATE = @SSE_CANDIDATE@
 STRIP = @STRIP@
 UUID_CFLAGS = @UUID_CFLAGS@
 UUID_LIBS = @UUID_LIBS@
@@ -244,6 +250,8 @@ datarootdir = @datarootdir@
 docdir = @docdir@
 dvidir = @dvidir@
 exec_prefix = @exec_prefix@
+has_jemalloc = @has_jemalloc@
+has_tcmalloc = @has_tcmalloc@
 host = @host@
 host_alias = @host_alias@
 host_cpu = @host_cpu@
@@ -288,6 +296,8 @@ dist_config_DATA = \
 	apps_groups.conf \
 	charts.d.conf \
 	python.d.conf \
+	health_alarm_notify.conf \
+	health_email_recipients.conf \
 	$(NULL)
 
 chartsconfigdir = $(configdir)/charts.d
@@ -315,6 +325,7 @@ dist_pythonconfig_DATA = \
 	python.d/phpfpm.conf \
 	python.d/postfix.conf \
 	python.d/redis.conf \
+	python.d/retroshare.conf \
 	python.d/sensors.conf \
 	python.d/squid.conf \
 	python.d/tomcat.conf \
@@ -326,13 +337,17 @@ dist_healthconfig_DATA = \
 	health.d/cpu.conf \
 	health.d/disks.conf \
 	health.d/entropy.conf \
+	health.d/tcp_resets.conf \
 	health.d/memcached.conf \
+	health.d/mysql.conf \
 	health.d/named.conf \
 	health.d/net.conf \
 	health.d/nginx.conf \
 	health.d/qos.conf \
 	health.d/ram.conf \
 	health.d/redis.conf \
+	health.d/retroshare.conf \
+	health.d/softnet.conf \
 	health.d/swap.conf \
 	health.d/squid.conf \
 	$(NULL)
diff --git a/conf.d/apps_groups.conf b/conf.d/apps_groups.conf
index 0a6f55cd7..57357a873 100644
--- a/conf.d/apps_groups.conf
+++ b/conf.d/apps_groups.conf
@@ -114,7 +114,7 @@ ha: corosync hs_logd ha_logd stonithd
 pbx: asterisk safe_asterisk *vicidial*
 sip: opensips* stund
 murmur: murmurd
-vines: *vines*
+xmpp: *vines* *prosody*
 
 # -----------------------------------------------------------------------------
 # monitoring
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
index 1fddbc99f..0aaf0e003 100644
--- a/conf.d/health.d/apache.conf
+++ b/conf.d/health.d/apache.conf
@@ -4,10 +4,11 @@
 template: apache_last_collected_secs
       on: apache.requests
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
-
+      to: webmaster
 
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 9332e508a..4d79fc799 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -1,24 +1,33 @@
 
-template: 5min_cpu_pcent
+template: 10min_cpu_usage
       on: system.cpu
-  lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice
-   every: 1m
-    warn: $this > 90
+  lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
    units: %
-    info: average cpu utilization for the last 5 minutes
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average cpu utilization for the last 10 minutes
+      to: sysadmin
 
-template: 5min_iowait_cpu_pcent
+template: 10min_cpu_iowait
       on: system.cpu
-  lookup: average -5m unaligned of iowait
-   every: 1m
-    warn: $this > 10
+  lookup: average -10m unaligned of iowait
    units: %
-    info: average wait I/O for the last 5 minutes
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average CPU wait I/O for the last 10 minutes
+      to: sysadmin
 
-template: 20min_steal_cpu_pcent
+template: 20min_steal_cpu
       on: system.cpu
   lookup: average -20m unaligned of steal
-   every: 5m
-    warn: $this > 10
    units: %
-    info: average stolen CPU time for the last 20 minutes
+   every: 5m
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average CPU steal time for the last 20 minutes
+      to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index c38f1a0a0..cc7a47660 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -1,18 +1,59 @@
 # -----------------------------------------------------------------------------
+# make sure we collect values for each disk
+
+# for mount points
+template: disk_space_last_collected_secs
+      on: disk.space
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection of the mount point
+      to: sysadmin
+
+# for block devices
+template: disk_last_collected_secs
+      on: disk.io
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection of the block device
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
 # low disk space
 
 # checking the latest collected values
 # raise an alarm if the disk is low on
 # available disk space
 
-template: disk_full_percent
+template: disk_space_usage
       on: disk.space
     calc: $used * 100 / ($avail + $used)
-   every: 1m
-    warn: $this > 80
-    crit: $this > 95
    units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING ) ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: up 1m down 15m multiplier 1.5 max 1h
     info: current disk space usage
+      to: sysadmin
+
+template: disk_inode_usage
+      on: disk.inodes
+    calc: $used * 100 / ($avail + $used)
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (75) : (80))
+    crit: $this > (($status == $CRITICAL) ? (90) : (95))
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: current disk inode usage
+      to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -20,7 +61,7 @@ template: disk_full_percent
 
 # calculate the rate the disk fills
 # use as base, the available space change
-# during the last 30 minutes
+# during the last hour
 
 # this is just a calculation - it has no alarm
 # we will use it in the next template to find
@@ -28,25 +69,27 @@ template: disk_full_percent
 
 template: disk_fill_rate
       on: disk.space
-  lookup: max -1s at -30m unaligned of avail
-    calc: ($this - $avail) / ($now - $after)
-   every: 15s
-   units: MB/s
-    info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes
+  lookup: min -10m at -50m unaligned of avail
+    calc: ($this - $avail) / (($now - $after) / 3600)
+   every: 1m
+   units: GB/hour
+    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
 
 
 # calculate the hours remaining
 # if the disk continues to fill
 # in this rate
 
-template: disk_full_after_hours
+template: out_of_disk_space_time
       on: disk.space
-    calc: $avail / $disk_fill_rate / 3600
-   every: 10s
-    warn: $this > 0 and $this < 48
-    crit: $this > 0 and $this < 24
+    calc: $avail / $disk_fill_rate
    units: hours
-    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.2 max 1h
+    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+      to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -59,13 +102,15 @@ template: disk_full_after_hours
 template: 10min_disk_utilization
       on: disk.util
   lookup: average -10m unaligned
+   units: %
    every: 1m
    green: 90
      red: 98
-    warn: $this > $green
-    crit: $this > $red
-   units: %
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
     info: the percentage of time the disk was busy, during the last 10 minutes
+      to: sysadmin
 
 
 # raise an alarm if the disk backlog
@@ -76,10 +121,12 @@ template: 10min_disk_utilization
 template: 10min_disk_backlog
       on: disk.backlog
   lookup: average -10m unaligned
-   every: 1m
-   green: 1000
-     red: 2000
-    warn: $this > $green
-    crit: $this > $red
    units: ms
+   every: 1m
+   green: 2000
+     red: 5000
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
     info: average of the kernel estimated disk backlog, for the last 10 minutes
+      to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index 6f8b6e851..d0eca8a6c 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -1,13 +1,14 @@
 
 # check if entropy is too low
 # the alarm is checked every 1 minute
-# and examines the last 30 minutes of data
+# and examines the last hour of data
 
-   alarm: min_30min_entropy
+   alarm: 1hour_lowest_entropy
       on: system.entropy
-  lookup: min -30m unaligned
-   every: 1m
-    warn: $this < 200
-    crit: $this < 100
+  lookup: min -1h unaligned
    units: entries
-    info: minimum entries in the random numbers pool (entropy), for the last 30 minutes
+   every: 5m
+    warn: $this < (($status >= $WARNING) ? (200) : (100))
+   delay: down 1h multiplier 1.5 max 1h
+    info: minimum entries in the random numbers pool in the last hour
+      to: silent
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
index 05ff14711..46a8ca0e5 100644
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -4,43 +4,49 @@
 template: memcached_last_collected_secs
       on: memcached.cache
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: dba
 
 
 # detect if memcached cache is full
 
-template: cache_full_pcent
+template: memcached_cache_memory_usage
       on: memcached.cache
     calc: $used * 100 / ($used + $available)
-   every: 10s
-    warn: $this > 80
-    crit: $this > 90
    units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: up 0 down 15m multiplier 1.5 max 1h
     info: current cache memory usage
+      to: dba
 
 
 # find the rate memcached cache is filling
 
 template: cache_fill_rate
       on: memcached.cache
-  lookup: max -1s at -30m unaligned of available
-    calc: ($this - $available) / ($now - $after)
-   every: 15s
-   units: KB/s
-    info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes
+  lookup: min -10m at -50m unaligned of available
+    calc: ($this - $available) / (($now - $after) / 3600)
+   units: KB/hour
+   every: 1m
+    info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
 
 
 # find the hours remaining until memcached cache is full
 
-template: cache_full_after_hours
+template: out_of_cache_space_time
       on: memcached.cache
-    calc: $available / $cache_fill_rate / 3600
-   every: 10s
-    warn: $this > 0 and $this < 48
-    crit: $this > 0 and $this < 24
+    calc: $available / $cache_fill_rate
    units: hours
-    info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+      to: dba
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
new file mode 100644
index 000000000..a2cfa3ec5
--- /dev/null
+++ b/conf.d/health.d/mysql.conf
@@ -0,0 +1,13 @@
+
+# make sure mysql is running
+
+template: mysql_last_collected_secs
+      on: mysql.queries
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
index e46d1d330..f2eaa83c7 100644
--- a/conf.d/health.d/named.conf
+++ b/conf.d/health.d/named.conf
@@ -4,9 +4,11 @@
 template: named_last_collected_secs
       on: named.global_queries
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: domainadmin
 
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index f65bc4fcb..7753aa184 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -1,27 +1,48 @@
+# -----------------------------------------------------------------------------
+# make sure we collect values for each interface
+
+template: interface_last_collected_secs
+      on: net.net
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
 
 # check if an interface is dropping packets
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
 
-template: 30min_packet_drops
+template: 1hour_packet_drops
       on: net.drops
-  lookup: sum -30m unaligned absolute
-   every: 1m
-    crit: $this > 0
+  lookup: sum -1h unaligned absolute
    units: packets
-    info: dropped packets in the last 30 minutes
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: interface dropped packets in the last hour
+      to: sysadmin
 
 
+# -----------------------------------------------------------------------------
+
 # check if an interface is having FIFO
 # buffer errors
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
 
-template: 30min_fifo_errors
+template: 1hour_fifo_errors
       on: net.fifo
-  lookup: sum -30m unaligned absolute
-   every: 1m
-    crit: $this > 0
+  lookup: sum -1h unaligned absolute
    units: errors
-    info: network interface fifo errors in the last 30 minutes
-
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: interface fifo errors in the last hour
+      to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
index da13008e3..d70d6a59b 100644
--- a/conf.d/health.d/nginx.conf
+++ b/conf.d/health.d/nginx.conf
@@ -4,9 +4,11 @@
 template: nginx_last_collected_secs
       on: nginx.requests
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: webmaster
 
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
index ac3bf8ff4..9e5939fdc 100644
--- a/conf.d/health.d/qos.conf
+++ b/conf.d/health.d/qos.conf
@@ -8,5 +8,7 @@
 #  lookup: sum -10m unaligned absolute
 #   every: 30s
 #    warn: $this > 0
+#   delay: up 0 down 30m multiplier 1.5 max 1h
 #   units: packets
 #    info: dropped packets in the last 30 minutes
+#      to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index 1d3681128..216b82fed 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -1,9 +1,11 @@
 
-   alarm: used_ram_pcent
+   alarm: ram_in_use
       on: system.ram
     calc: $used * 100 / ($used + $cached + $free)
-   every: 10s
-    warn: $this > 80
-    crit: $this > 90
    units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
     info: system RAM usage
+      to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 3750176c5..3e648d85d 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -4,9 +4,11 @@
 template: redis_last_collected_secs
       on: redis.operations
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: dba
 
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
new file mode 100644
index 000000000..1af7b4686
--- /dev/null
+++ b/conf.d/health.d/retroshare.conf
@@ -0,0 +1,25 @@
+# make sure RetroShare is running
+
+template: retroshare_last_collected_secs
+      on: retroshare.peers
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# make sure the DHT is fine when active
+
+template: retroshare_dht_working
+      on: retroshare.dht
+    calc: $dht_size_all
+   units: peers
+   every: 1m
+    warn: $this < (($status >= $WARNING)  ? (120) : (100))
+    crit: $this < (($status == $CRITICAL) ? (10)  : (1))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: Checks if the DHT has enough peers to operate
+      to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
new file mode 100644
index 000000000..0c3709f46
--- /dev/null
+++ b/conf.d/health.d/softnet.conf
@@ -0,0 +1,21 @@
+# check for common /proc/net/softnet_stat errors
+
+   alarm: 1hour_netdev_backlog_exceeded
+      on: system.softnet_stat
+  lookup: sum -1h unaligned absolute of dropped
+   units: packets
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+      to: sysadmin
+
+   alarm: 1hour_netdev_budget_ran_outs
+      on: system.softnet_stat
+  lookup: sum -1h unaligned absolute of squeezed
+   units: events
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
+      to: silent
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
index cc5ce1c3a..76143c5d7 100644
--- a/conf.d/health.d/squid.conf
+++ b/conf.d/health.d/squid.conf
@@ -4,9 +4,11 @@
 template: squid_last_collected_secs
       on: squid.clients_requests
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: proxyadmin
 
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index 552dd310a..0cfa888c4 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -4,17 +4,21 @@
   lookup: sum -30m unaligned absolute of out
           # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   every: 1m
-    warn: $this > 1
-    crit: $this > 10
    units: % of RAM
-    info: the sum of all memory swapped out during the last 30 minutes, as a percentage of the available RAM
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (15) : (20))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
+      to: sysadmin
 
-   alarm: pcent_of_ram_in_swap
+   alarm: used_swap_space
       on: system.swap
     calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   every: 10s
-    warn: $this > 10
-    crit: $this > 50
    units: % of RAM
-    info: the currently used swap space, as a percentage of the available RAM
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (15) : (20))
+    crit: $this > (($status == $CRITICAL) ? (40) : (50))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: the swap memory used, as a percentage of the system RAM
+      to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
new file mode 100644
index 000000000..8e93c4793
--- /dev/null
+++ b/conf.d/health.d/tcp_resets.conf
@@ -0,0 +1,32 @@
+# -----------------------------------------------------------------------------
+
+   alarm: ipv4_tcphandshake_last_collected_secs
+      on: ipv4.tcphandshake
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+
+   alarm: 1m_ipv4_tcp_resets
+      on: ipv4.tcphandshake
+  lookup: average -1m at -10s unaligned absolute of OutRsts
+   units: tcp resets/s
+   every: 10s
+    info: average TCP RESETS this host is sending, over the last minute
+
+   alarm: 10s_ipv4_tcp_resets
+      on: ipv4.tcphandshake
+  lookup: average -10s unaligned absolute of OutRsts
+   units: tcp resets/s
+   every: 10s
+    warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING)  ? (1) : (4)))
+   delay: up 0 down 60m multiplier 1.2 max 2h
+    info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
+      to: sysadmin
+
diff --git a/conf.d/health_alarm_notify.conf b/conf.d/health_alarm_notify.conf
new file mode 100644
index 000000000..d451cafed
--- /dev/null
+++ b/conf.d/health_alarm_notify.conf
@@ -0,0 +1,222 @@
+# Configuration for alarm notifications
+#
+# This configuration is used by: alarm-notify.sh
+# changes take effect immediately (the next alarm will use them).
+#
+# alarm-notify.sh can send:
+# - e-mails (using the sendmail command),
+# - push notifications to your mobile phone (pushover.net),
+# - messages to your slack team (slack.com),
+# - messages to your telegram chat / group chat (telegram.org)
+#
+# The 'to' line given at netdata alarms defines a *role*, so that many
+# people can be notified for each role.
+#
+# This file is a BASH script itself.
+#
+#
+###############################################################################
+# proxy configuration
+#
+# If you need to send curl based notifications (pushover, slack, telegram)
+# via a proxy, set these to your proxy address:
+#export http_proxy="http://10.0.0.1:3128/"
+#export https_proxy="http://10.0.0.1:3128/"
+
+
+###############################################################################
+# notifications images
+#
+# Images in notifications need to be downloaded from an Internet facing site.
+# To allow notification providers to fetch the icons/images, by default we set
+# the URL of the global public netdata registry.
+# If you have an Internet facing netdata (or you have copied the images/ folder
+# of netdata to your web server), set its URL here, to fetch the notification
+# images from it.
+#images_base_url="http://my.public.netdata.server:19999"
+
+
+###############################################################################
+# external commands
+
+# The full path to the sendmail command.
+# If empty, the system $PATH will be searched for it.
+# If not found, email notifications will be disabled (silently).
+sendmail=""
+
+# The full path of the curl command.
+# If empty, the system $PATH will be searched for it.
+# If not found, pushover, telegram and slack notifications will be
+# silently disabled.
+curl=""
+
+
+###############################################################################
+# NOTE ABOUT RECIPIENTS
+#
+# When you define recipients (all types):
+#
+#  - email addresses
+#  - pushover user tokens
+#  - telegram chat ids
+#  - slack channels
+#
+# You can append |critical to limit the notifications to be sent.
+#
+# In these examples, the first recipient receives all the alarms
+# while the second one receives only the critical ones:
+#
+#  email   : "user1@example.com user2@example.com|critical"
+#  pushover: "2987343...9437837 8756278...2362736|critical"
+#  telegram: "111827421 112746832|critical"
+#  slack   : "alarms disasters|critical"
+#
+# If a recipient is set to empty string, the default recipient of the given
+# notification method (email, pushover, telegram, slack) will be used.
+# To disable a notification, use the recipient called: disabled
+# This works for all notification methods (including the default recipients).
+
+
+###############################################################################
+# email global notification options
+
+# multiple recipients can be given like this:
+#              "admin1@example.com admin2@example.com ..."
+
+# enable/disable sending emails
+SEND_EMAIL="YES"
+
+# if a role recipient is not configured, an email will be sent to:
+DEFAULT_RECIPIENT_EMAIL="root"
+# to receive only critical alarms, set it to "root|critical"
+
+
+###############################################################################
+# pushover (pushover.net) global notification options
+
+# multiple recipients can be given like this:
+#                  "USERTOKEN1 USERTOKEN2 ..."
+
+# enable/disable sending pushover notifications
+SEND_PUSHOVER="YES"
+
+# Login to pushover.net to get your pushover app token.
+# You need only one for all your netdata servers (or you can have one for
+# each of your netdata - your call).
+# Without an app token, netdata cannot send pushover notifications.
+PUSHOVER_APP_TOKEN=""
+
+# if a role's recipients are not configured, a notification will be sent to
+# this pushover user token (empty = do not send a notification for unconfigured
+# roles):
+DEFAULT_RECIPIENT_PUSHOVER=""
+
+
+###############################################################################
+# telegram (telegram.org) global notification options
+
+# To get your chat ID send the command /my_id to telegram bot @get_id.
+# Users also need to open a query with the bot (see below).
+
+# note: multiple recipients can be given like this:
+#                  "CHAT_ID_1 CHAT_ID_2 ..."
+
+# enable/disable sending telegram messages
+SEND_TELEGRAM="YES"
+
+# Contact the bot @BotFather to create a new bot and receive a bot token.
+# Without it, netdata cannot send telegram messages.
+TELEGRAM_BOT_TOKEN=""
+
+# If a role's recipients are not configured, a message will be sent to
+# this chat id (empty = do not send a notification for unconfigured roles):
+DEFAULT_RECIPIENT_TELEGRAM=""
+
+
+###############################################################################
+# slack (slack.com) global notification options
+
+# multiple recipients can be given like this:
+#                  "CHANNEL1 CHANNEL2 ..."
+
+# enable/disable sending slack notifications
+SEND_SLACK="YES"
+
+# Login to slack.com and create an incoming webhook. You need only one for all
+# your netdata servers (or you can have one for each of your netdata).
+# Without it, netdata cannot send slack notifications.
+# Get yours from: https://api.slack.com/incoming-webhooks
+SLACK_WEBHOOK_URL=""
+
+# if a role's recipients are not configured, a notification will be sent to
+# this slack channel (empty = do not send a notification for unconfigured
+# roles):
+DEFAULT_RECIPIENT_SLACK=""
+
+
+###############################################################################
+# RECIPIENTS PER ROLE
+
+# -----------------------------------------------------------------------------
+# generic system alarms
+# CPU, disks, network interfaces, entropy, etc
+
+role_recipients_email[sysadmin]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[sysadmin]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[sysadmin]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[sysadmin]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# DNS related alarms
+
+role_recipients_email[domainadmin]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[domainadmin]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[domainadmin]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[domainadmin]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# database servers alarms
+# mysql, redis, memcached, etc
+
+role_recipients_email[dba]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[dba]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[dba]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[dba]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# web servers alarms
+# apache, nginx, etc
+
+role_recipients_email[webmaster]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[webmaster]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[webmaster]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[webmaster]="${DEFAULT_RECIPIENT_SLACK}"
+
+
+# -----------------------------------------------------------------------------
+# proxy servers alarms
+# squid, etc
+
+role_recipients_email[proxyadmin]="${DEFAULT_RECIPIENT_EMAIL}"
+
+role_recipients_pushover[proxyadmin]="${DEFAULT_RECIPIENT_PUSHOVER}"
+
+role_recipients_telegram[proxyadmin]="${DEFAULT_RECIPIENT_TELEGRAM}"
+
+role_recipients_slack[proxyadmin]="${DEFAULT_RECIPIENT_SLACK}"
+
diff --git a/conf.d/health_email_recipients.conf b/conf.d/health_email_recipients.conf
new file mode 100644
index 000000000..f56c6c64a
--- /dev/null
+++ b/conf.d/health_email_recipients.conf
@@ -0,0 +1,2 @@
+# OBSOLETE FILE
+# REPLACED WITH health_alarm_notify.conf
diff --git a/conf.d/python.d/mysql.conf b/conf.d/python.d/mysql.conf
index d247b89a0..8fbbe6513 100644
--- a/conf.d/python.d/mysql.conf
+++ b/conf.d/python.d/mysql.conf
@@ -104,6 +104,8 @@ tcp:
   # pass     : ''
   host     : 'localhost'
   port     : '3306'
+  # keep in mind port might be ignored by mysql, if host = 'localhost'
+  # http://serverfault.com/questions/337818/how-to-force-mysql-to-connect-by-tcp-instead-of-a-unix-socket/337844#337844
 
 tcpipv4:
   name     : 'local'
@@ -158,6 +160,8 @@ tcp_root:
   # pass     : ''
   host     : 'localhost'
   port     : '3306'
+  # keep in mind port might be ignored by mysql, if host = 'localhost'
+  # http://serverfault.com/questions/337818/how-to-force-mysql-to-connect-by-tcp-instead-of-a-unix-socket/337844#337844
 
 tcpipv4_root:
   name     : 'local'
diff --git a/conf.d/python.d/nginx.conf b/conf.d/python.d/nginx.conf
index 1a27d67c5..645925a55 100644
--- a/conf.d/python.d/nginx.conf
+++ b/conf.d/python.d/nginx.conf
@@ -1,5 +1,17 @@
 # netdata python.d.plugin configuration for nginx
 #
+# You must have ngx_http_stub_status_module configured on your nginx server for this
+# plugin to work. The following is an example config.
+# It must be located inside a server { } block.
+#  
+# location /stub_status {
+#   stub_status;
+#   # Security: Only allow access from the IP below.
+#   allow 192.168.1.200;
+#   # Deny anyone else
+#   deny all;
+# }
+#
 # This file is in YaML format. Generally the format is:
 #
 # name: value
@@ -47,14 +59,15 @@
 # predefined parameters. These are:
 #
 # job_name:
-#     name: myname     # the JOB's name as it will appear at the
-#                      # dashboard (by default is the job_name)
+#     name: my_name    # the JOB's name as it will appear at the
+#                      # dashboard. If name: is not supplied the
+#                      # job_name: will be used (use _ for spaces)
 #                      # JOBs sharing a name are mutually exclusive
 #     update_every: 1  # the JOB's data collection frequency
 #     priority: 60000  # the JOB's order on the dashboard
 #     retries: 5       # the JOB's number of restoration attempts
 #
-# Additionally to the above, nginx also supports the following:
+# Additionally to the above, this plugin also supports the following:
 #
 #     url: 'URL'       # the URL to fetch nginx's status stats
 #
@@ -63,6 +76,14 @@
 #     user: 'username'
 #     pass: 'password'
 #
+# Example
+# 
+# RemoteNginx:
+#     name : 'Reverse_Proxy'
+#     url  : 'http://yourdomain.com/stub_status'
+#
+# "RemoteNginx" will show up in Netdata logs. "Reverse Proxy" (the name, with
+# "_" shown as a space) will show up in the menu in the nginx section.
 
 # ----------------------------------------------------------------------
 # AUTO-DETECTION JOBS
diff --git a/conf.d/python.d/phpfpm.conf b/conf.d/python.d/phpfpm.conf
index 06d2367ae..f5d067cc7 100644
--- a/conf.d/python.d/phpfpm.conf
+++ b/conf.d/python.d/phpfpm.conf
@@ -57,6 +57,7 @@
 # Additionally to the above, PHP-FPM also supports the following:
 #
 #     url: 'URL'       # the URL to fetch nginx's status stats
+#                      # Be sure to include ?full&json at the end of the url
 #
 # if the URL is password protected, the following are supported:
 #
@@ -70,13 +71,13 @@
 
 localhost:
   name : 'local'
-  url  : "http://localhost/status"
+  url  : "http://localhost/status?full&json"
 
 localipv4:
   name : 'local'
-  url  : "http://127.0.0.1/status"
+  url  : "http://127.0.0.1/status?full&json"
 
 localipv6:
   name : 'local'
-  url  : "http://::1/status"
+  url  : "http://::1/status?full&json"
 
diff --git a/conf.d/python.d/retroshare.conf b/conf.d/python.d/retroshare.conf
new file mode 100644
index 000000000..79614373b
--- /dev/null
+++ b/conf.d/python.d/retroshare.conf
@@ -0,0 +1,67 @@
+# netdata python.d.plugin configuration for RetroShare
+#
+# This file is in YaML format. Generally the format is:
+#
+# name: value
+#
+# There are 2 sections:
+#  - global variables
+#  - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 1
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# retries sets the number of retries to be made in case of failures.
+# If unset, the default for python.d.plugin is used.
+# Attempts to restore the service are made once every update_every
+# and only if the module has collected values in the past.
+# retries: 5
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+#     name: myname     # the JOB's name as it will appear at the
+#                      # dashboard (by default is the job_name)
+#                      # JOBs sharing a name are mutually exclusive
+#     update_every: 1  # the JOB's data collection frequency
+#     priority: 60000  # the JOB's order on the dashboard
+#     retries: 5       # the JOB's number of restoration attempts
+#
+# Additionally to the above, RetroShare also supports the following:
+#
+# - url: 'url'         # the URL to the WebUI
+#
+# ----------------------------------------------------------------------
+# AUTO-DETECTION JOBS
+# only one of them will run (they have the same name)
+
+localhost:
+    name: 'local'
+    url: 'http://localhost:9090'
author	Federico Ceratto <federico.ceratto@gmail.com>	2016-11-23 15:49:14 +0000
committer	Federico Ceratto <federico.ceratto@gmail.com>	2016-11-23 15:49:14 +0000
commit	68141d9dac0c08e51d257feef16a79086dd8a2df (patch)
tree	f4a0f5d31ed2194b5991130754b297b9c8c076e6 /conf.d
parent	Release v. 1.3.0+dfsg-1 to Unstable (diff)
parent	New upstream version 1.4.0+dfsg (diff)
download	netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.tar.xz netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.zip