diff options
Diffstat (limited to '')
26 files changed, 647 insertions, 116 deletions
diff --git a/conf.d/Makefile.am b/conf.d/Makefile.am index 02fe86b0..066744ca 100644 --- a/conf.d/Makefile.am +++ b/conf.d/Makefile.am @@ -7,6 +7,8 @@ dist_config_DATA = \ apps_groups.conf \ charts.d.conf \ python.d.conf \ + health_alarm_notify.conf \ + health_email_recipients.conf \ $(NULL) chartsconfigdir=$(configdir)/charts.d @@ -34,6 +36,7 @@ dist_pythonconfig_DATA = \ python.d/phpfpm.conf \ python.d/postfix.conf \ python.d/redis.conf \ + python.d/retroshare.conf \ python.d/sensors.conf \ python.d/squid.conf \ python.d/tomcat.conf \ @@ -45,13 +48,17 @@ dist_healthconfig_DATA = \ health.d/cpu.conf \ health.d/disks.conf \ health.d/entropy.conf \ + health.d/tcp_resets.conf \ health.d/memcached.conf \ + health.d/mysql.conf \ health.d/named.conf \ health.d/net.conf \ health.d/nginx.conf \ health.d/qos.conf \ health.d/ram.conf \ health.d/redis.conf \ + health.d/retroshare.conf \ + health.d/softnet.conf \ health.d/swap.conf \ health.d/squid.conf \ $(NULL) diff --git a/conf.d/Makefile.in b/conf.d/Makefile.in index 9356f60e..823713bf 100644 --- a/conf.d/Makefile.in +++ b/conf.d/Makefile.in @@ -84,8 +84,13 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ $(dist_healthconfig_DATA) $(dist_nodeconfig_DATA) \ $(dist_pythonconfig_DATA) ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_pthread.m4 \ - $(top_srcdir)/configure.ac +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_c___atomic.m4 \ + $(top_srcdir)/m4/ax_c__generic.m4 \ + $(top_srcdir)/m4/ax_c_mallinfo.m4 \ + $(top_srcdir)/m4/ax_c_mallopt.m4 \ + $(top_srcdir)/m4/ax_check_compile_flag.m4 \ + $(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/jemalloc.m4 \ + $(top_srcdir)/m4/tcmalloc.m4 $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d @@ -212,6 +217,7 @@ PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ PTHREAD_LIBS = @PTHREAD_LIBS@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SSE_CANDIDATE = @SSE_CANDIDATE@ 
STRIP = @STRIP@ UUID_CFLAGS = @UUID_CFLAGS@ UUID_LIBS = @UUID_LIBS@ @@ -244,6 +250,8 @@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ +has_jemalloc = @has_jemalloc@ +has_tcmalloc = @has_tcmalloc@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ @@ -288,6 +296,8 @@ dist_config_DATA = \ apps_groups.conf \ charts.d.conf \ python.d.conf \ + health_alarm_notify.conf \ + health_email_recipients.conf \ $(NULL) chartsconfigdir = $(configdir)/charts.d @@ -315,6 +325,7 @@ dist_pythonconfig_DATA = \ python.d/phpfpm.conf \ python.d/postfix.conf \ python.d/redis.conf \ + python.d/retroshare.conf \ python.d/sensors.conf \ python.d/squid.conf \ python.d/tomcat.conf \ @@ -326,13 +337,17 @@ dist_healthconfig_DATA = \ health.d/cpu.conf \ health.d/disks.conf \ health.d/entropy.conf \ + health.d/tcp_resets.conf \ health.d/memcached.conf \ + health.d/mysql.conf \ health.d/named.conf \ health.d/net.conf \ health.d/nginx.conf \ health.d/qos.conf \ health.d/ram.conf \ health.d/redis.conf \ + health.d/retroshare.conf \ + health.d/softnet.conf \ health.d/swap.conf \ health.d/squid.conf \ $(NULL) diff --git a/conf.d/apps_groups.conf b/conf.d/apps_groups.conf index 0a6f55cd..57357a87 100644 --- a/conf.d/apps_groups.conf +++ b/conf.d/apps_groups.conf @@ -114,7 +114,7 @@ ha: corosync hs_logd ha_logd stonithd pbx: asterisk safe_asterisk *vicidial* sip: opensips* stund murmur: murmurd -vines: *vines* +xmpp: *vines* *prosody* # ----------------------------------------------------------------------------- # monitoring diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf index 1fddbc99..0aaf0e00 100644 --- a/conf.d/health.d/apache.conf +++ b/conf.d/health.d/apache.conf @@ -4,10 +4,11 @@ template: apache_last_collected_secs on: apache.requests calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= 
$WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection - + to: webmaster diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf index 9332e508..4d79fc79 100644 --- a/conf.d/health.d/cpu.conf +++ b/conf.d/health.d/cpu.conf @@ -1,24 +1,33 @@ -template: 5min_cpu_pcent +template: 10min_cpu_usage on: system.cpu - lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice - every: 1m - warn: $this > 90 + lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice units: % - info: average cpu utilization for the last 5 minutes + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: average cpu utilization for the last 10 minutes + to: sysadmin -template: 5min_iowait_cpu_pcent +template: 10min_cpu_iowait on: system.cpu - lookup: average -5m unaligned of iowait - every: 1m - warn: $this > 10 + lookup: average -10m unaligned of iowait units: % - info: average wait I/O for the last 5 minutes + every: 1m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: average CPU wait I/O for the last 10 minutes + to: sysadmin -template: 20min_steal_cpu_pcent +template: 20min_steal_cpu on: system.cpu lookup: average -20m unaligned of steal - every: 5m - warn: $this > 10 units: % - info: average stolen CPU time for the last 20 minutes + every: 5m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? 
(20) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: average CPU steal time for the last 20 minutes + to: sysadmin diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf index c38f1a0a..cc7a4766 100644 --- a/conf.d/health.d/disks.conf +++ b/conf.d/health.d/disks.conf @@ -1,18 +1,59 @@ # ----------------------------------------------------------------------------- +# make sure we collect values for each disk + +# for mount points +template: disk_space_last_collected_secs + on: disk.space + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection of the mount point + to: sysadmin + +# for block devices +template: disk_last_collected_secs + on: disk.io + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection of the block device + to: sysadmin + + +# ----------------------------------------------------------------------------- # low disk space # checking the latest collected values # raise an alarm if the disk is low on # available disk space -template: disk_full_percent +template: disk_space_usage on: disk.space calc: $used * 100 / ($avail + $used) - every: 1m - warn: $this > 80 - crit: $this > 95 units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? 
(85) : (95)) + delay: up 1m down 15m multiplier 1.5 max 1h info: current disk space usage + to: sysadmin + +template: disk_inode_usage + on: disk.inodes + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (80)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) + delay: up 1m down 15m multiplier 1.5 max 1h + info: current disk inode usage + to: sysadmin # ----------------------------------------------------------------------------- @@ -20,7 +61,7 @@ template: disk_full_percent # calculate the rate the disk fills # use as base, the available space change -# during the last 30 minutes +# during the last hour # this is just a calculation - it has no alarm # we will use it in the next template to find @@ -28,25 +69,27 @@ template: disk_full_percent template: disk_fill_rate on: disk.space - lookup: max -1s at -30m unaligned of avail - calc: ($this - $avail) / ($now - $after) - every: 15s - units: MB/s - info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour # calculate the hours remaining # if the disk continues to fill # in this rate -template: disk_full_after_hours +template: out_of_disk_space_time on: disk.space - calc: $avail / $disk_fill_rate / 3600 - every: 10s - warn: $this > 0 and $this < 48 - crit: $this > 0 and $this < 24 + calc: $avail / $disk_fill_rate units: hours - info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) + delay: down 15m multiplier 1.2 max 1h + info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour + to: sysadmin # ----------------------------------------------------------------------------- @@ -59,13 +102,15 @@ template: disk_full_after_hours template: 10min_disk_utilization on: disk.util lookup: average -10m unaligned + units: % every: 1m green: 90 red: 98 - warn: $this > $green - crit: $this > $red - units: % + warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) + crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h info: the percentage of time the disk was busy, during the last 10 minutes + to: sysadmin # raise an alarm if the disk backlog @@ -76,10 +121,12 @@ template: 10min_disk_utilization template: 10min_disk_backlog on: disk.backlog lookup: average -10m unaligned - every: 1m - green: 1000 - red: 2000 - warn: $this > $green - crit: $this > $red units: ms + every: 1m + green: 2000 + red: 5000 + warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) + crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h info: average of the kernel estimated disk backlog, for the last 10 minutes + to: sysadmin diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf index 6f8b6e85..d0eca8a6 100644 --- a/conf.d/health.d/entropy.conf +++ b/conf.d/health.d/entropy.conf @@ -1,13 +1,14 @@ # check if entropy is too low # the alarm is checked every 1 minute -# and examines the last 30 minutes of data +# and examines the last hour of data - alarm: min_30min_entropy + alarm: 1hour_lowest_entropy on: system.entropy - lookup: min -30m unaligned - every: 1m - warn: $this < 200 - crit: $this < 100 + lookup: min -1h unaligned units: entries - info: minimum entries in the random numbers pool (entropy), for the last 30 minutes + every: 5m + warn: $this < (($status >= $WARNING) ? 
(200) : (100)) + delay: down 1h multiplier 1.5 max 1h + info: minimum entries in the random numbers pool in the last 30 minutes + to: silent diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf index 05ff1471..46a8ca0e 100644 --- a/conf.d/health.d/memcached.conf +++ b/conf.d/health.d/memcached.conf @@ -4,43 +4,49 @@ template: memcached_last_collected_secs on: memcached.cache calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: dba # detect if memcached cache is full -template: cache_full_pcent +template: memcached_cache_memory_usage on: memcached.cache calc: $used * 100 / ($used + $available) - every: 10s - warn: $this > 80 - crit: $this > 90 units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? 
(80) : (90)) + delay: up 0 down 15m multiplier 1.5 max 1h info: current cache memory usage + to: dba # find the rate memcached cache is filling template: cache_fill_rate on: memcached.cache - lookup: max -1s at -30m unaligned of available - calc: ($this - $available) / ($now - $after) - every: 15s - units: KB/s - info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes + lookup: min -10m at -50m unaligned of available + calc: ($this - $available) / (($now - $after) / 3600) + units: KB/hour + every: 1m + info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour # find the hours remaining until memcached cache is full -template: cache_full_after_hours +template: out_of_cache_space_time on: memcached.cache - calc: $available / $cache_fill_rate / 3600 - every: 10s - warn: $this > 0 and $this < 48 - crit: $this > 0 and $this < 24 + calc: $available / $cache_fill_rate units: hours - info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.5 max 1h + info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour + to: dba diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf new file mode 100644 index 00000000..a2cfa3ec --- /dev/null +++ b/conf.d/health.d/mysql.conf @@ -0,0 +1,13 @@ + +# make sure mysql is running + +template: mysql_last_collected_secs + on: mysql.queries + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf index e46d1d33..f2eaa83c 100644 --- a/conf.d/health.d/named.conf +++ b/conf.d/health.d/named.conf @@ -4,9 +4,11 @@ template: named_last_collected_secs on: named.global_queries calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: domainadmin diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index f65bc4fc..7753aa18 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -1,27 +1,48 @@ +# ----------------------------------------------------------------------------- +# make sure we collect values for each interface + +template: interface_last_collected_secs + on: net.net + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + + +# ----------------------------------------------------------------------------- # check if an interface is dropping packets -# the alarm is checked every 10 seconds -# and examines the last 30 minutes of data +# the alarm is checked every 1 minute +# and examines the last hour of data -template: 30min_packet_drops +template: 1hour_packet_drops on: net.drops - lookup: sum -30m unaligned absolute - every: 1m - crit: $this > 0 + lookup: sum -1h unaligned absolute units: packets - info: dropped packets in the last 30 minutes + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: interface dropped packets in the last hour + to: sysadmin +# ----------------------------------------------------------------------------- + # check if an interface is having FIFO # buffer errors -# the alarm is checked every 10 seconds -# and examines the last 30 minutes of data +# the alarm is checked every 1 minute +# and examines the last hour of data -template: 30min_fifo_errors +template: 1hour_fifo_errors on: net.fifo - lookup: sum -30m unaligned absolute - every: 1m - crit: $this > 0 + lookup: sum -1h unaligned absolute units: errors - info: network interface fifo errors in the last 30 minutes - + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: interface fifo errors in the last hour + to: sysadmin diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf index da13008e..d70d6a59 100644 --- a/conf.d/health.d/nginx.conf +++ b/conf.d/health.d/nginx.conf @@ -4,9 +4,11 @@ template: nginx_last_collected_secs on: nginx.requests calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: webmaster diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf index ac3bf8ff..9e5939fd 100644 --- a/conf.d/health.d/qos.conf +++ b/conf.d/health.d/qos.conf @@ -8,5 +8,7 @@ # lookup: sum -10m unaligned absolute # every: 30s # warn: $this > 0 +# delay: up 0 down 30m multiplier 1.5 max 1h # units: packets # info: dropped packets in the last 30 minutes +# to: sysadmin diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf index 1d368112..216b82fe 100644 --- a/conf.d/health.d/ram.conf +++ b/conf.d/health.d/ram.conf @@ -1,9 +1,11 @@ - alarm: used_ram_pcent + alarm: ram_in_use on: system.ram calc: $used * 100 / ($used + $cached + $free) - every: 10s - warn: $this > 80 - crit: $this > 90 units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h info: system RAM usage + to: sysadmin diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf index 3750176c..3e648d85 100644 --- a/conf.d/health.d/redis.conf +++ b/conf.d/health.d/redis.conf @@ -4,9 +4,11 @@ template: redis_last_collected_secs on: redis.operations calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: dba diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf new file mode 100644 index 00000000..1af7b468 --- /dev/null +++ b/conf.d/health.d/retroshare.conf @@ -0,0 +1,25 @@ +# make sure RetroShare is running + +template: retroshare_last_collected_secs + on: retroshare.peers + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure the DHT is fine when active + +template: retroshare_dht_working + on: retroshare.dht + calc: $dht_size_all + units: peers + every: 1m + warn: $this < (($status >= $WARNING) ? (120) : (100)) + crit: $this < (($status == $CRITICAL) ? 
(10) : (1)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: Checks if the DHT has enough peers to operate + to: sysadmin diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf new file mode 100644 index 00000000..0c3709f4 --- /dev/null +++ b/conf.d/health.d/softnet.conf @@ -0,0 +1,21 @@ +# check for common /proc/net/softnet_stat errors + + alarm: 1hour_netdev_backlog_exceeded + on: system.softnet_stat + lookup: sum -1h unaligned absolute of dropped + units: packets + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) + to: sysadmin + + alarm: 1hour_netdev_budget_ran_outs + on: system.softnet_stat + lookup: sum -1h unaligned absolute of squeezed + units: events + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets) + to: silent diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf index cc5ce1c3..76143c5d 100644 --- a/conf.d/health.d/squid.conf +++ b/conf.d/health.d/squid.conf @@ -4,9 +4,11 @@ template: squid_last_collected_secs on: squid.clients_requests calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: proxyadmin diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf index 552dd310..0cfa888c 100644 --- a/conf.d/health.d/swap.conf +++ b/conf.d/health.d/swap.conf @@ -4,17 +4,21 @@ lookup: sum -30m unaligned absolute of out # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - every: 1m - warn: $this > 1 - crit: $this > 10 units: % of RAM - info: the sum of all memory swapped out during the last 30 minutes, as a percentage of the available RAM + every: 1m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? (15) : (20)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM + to: sysadmin - alarm: pcent_of_ram_in_swap + alarm: used_swap_space on: system.swap calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - every: 10s - warn: $this > 10 - crit: $this > 50 units: % of RAM - info: the currently used swap space, as a percentage of the available RAM + every: 10s + warn: $this > (($status >= $WARNING) ? (15) : (20)) + crit: $this > (($status == $CRITICAL) ? (40) : (50)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: the swap memory used, as a percentage of the system RAM + to: sysadmin diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf new file mode 100644 index 00000000..8e93c479 --- /dev/null +++ b/conf.d/health.d/tcp_resets.conf @@ -0,0 +1,32 @@ +# ----------------------------------------------------------------------------- + + alarm: ipv4_tcphandshake_last_collected_secs + on: ipv4.tcphandshake + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# ----------------------------------------------------------------------------- + + alarm: 1m_ipv4_tcp_resets + on: ipv4.tcphandshake + lookup: average -1m at -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + info: average TCP RESETS this host is sending, over the last minute + + alarm: 10s_ipv4_tcp_resets + on: ipv4.tcphandshake + lookup: average -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING) ? (1) : (4))) + delay: up 0 down 60m multiplier 1.2 max 2h + info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed) + to: sysadmin + diff --git a/conf.d/health_alarm_notify.conf b/conf.d/health_alarm_notify.conf new file mode 100644 index 00000000..d451cafe --- /dev/null +++ b/conf.d/health_alarm_notify.conf @@ -0,0 +1,222 @@ +# Configuration for alarm notifications +# +# This configuration is used by: alarm-notify.sh +# changes take effect immediately (the next alarm will use them). +# +# alarm-notify.sh can send: +# - e-mails (using the sendmail command), +# - push notifications to your mobile phone (pushover.net), +# - messages to your slack team (slack.com), +# - messages to your telegram chat / group chat (telegram.org) +# +# The 'to' line given at netdata alarms defines a *role*, so that many +# people can be notified for each role. +# +# This file is a BASH script itself. 
+# +# +############################################################################### +# proxy configuration +# +# If you need to send curl based notifications (pushover, slack, telegram) +# via a proxy, set these to your proxy address: +#export http_proxy="http://10.0.0.1:3128/" +#export https_proxy="http://10.0.0.1:3128/" + + +############################################################################### +# notification images +# +# Images in notifications need to be downloaded from an Internet facing site. +# To allow notification providers to fetch the icons/images, by default we set +# the URL of the global public netdata registry. +# If you have an Internet facing netdata (or you have copied the images/ folder +# of netdata to your web server), set its URL here, to fetch the notification +# images from it. +#images_base_url="http://my.public.netdata.server:19999" + + +############################################################################### +# external commands + +# The full path to the sendmail command. +# If empty, the system $PATH will be searched for it. +# If not found, email notifications will be disabled (silently). +sendmail="" + +# The full path of the curl command. +# If empty, the system $PATH will be searched for it. +# If not found, pushover, telegram and slack notifications will be +# silently disabled. +curl="" + + +############################################################################### +# NOTE ABOUT RECIPIENTS +# +# When you define recipients (all types): +# +# - email addresses +# - pushover user tokens +# - telegram chat ids +# - slack channels +# +# You can append |critical to limit the notifications to be sent. 
+# +# In these examples, the first recipient receives all the alarms +# while the second one receives only the critical ones: +# +# email : "user1@example.com user2@example.com|critical" +# pushover: "2987343...9437837 8756278...2362736|critical" +# telegram: "111827421 112746832|critical" +# slack : "alarms disasters|critical" +# +# If a recipient is set to an empty string, the default recipient of the given +# notification method (email, pushover, telegram, slack) will be used. +# To disable a notification, use the recipient called: disabled +# This works for all notification methods (including the default recipients). + + +############################################################################### +# email global notification options + +# multiple recipients can be given like this: +# "admin1@example.com admin2@example.com ..." + +# enable/disable sending emails +SEND_EMAIL="YES" + +# if a role recipient is not configured, an email will be sent to: +DEFAULT_RECIPIENT_EMAIL="root" +# to receive only critical alarms, set it to "root|critical" + + +############################################################################### +# pushover (pushover.net) global notification options + +# multiple recipients can be given like this: +# "USERTOKEN1 USERTOKEN2 ..." + +# enable/disable sending pushover notifications +SEND_PUSHOVER="YES" + +# Login to pushover.net to get your pushover app token. +# You need only one for all your netdata servers (or you can have one for +# each of your netdata - your call). +# Without an app token, netdata cannot send pushover notifications. 
+PUSHOVER_APP_TOKEN="" + +# if a role's recipients are not configured, a notification will be sent to +# this pushover user token (empty = do not send a notification for unconfigured +# roles): +DEFAULT_RECIPIENT_PUSHOVER="" + + +############################################################################### +# telegram (telegram.org) global notification options + +# To get your chat ID send the command /my_id to telegram bot @get_id. +# Users also need to open a query with the bot (see below). + +# note: multiple recipients can be given like this: +# "CHAT_ID_1 CHAT_ID_2 ..." + +# enable/disable sending telegram messages +SEND_TELEGRAM="YES" + +# Contact the bot @BotFather to create a new bot and receive a bot token. +# Without it, netdata cannot send telegram messages. +TELEGRAM_BOT_TOKEN="" + +# If a role's recipients are not configured, a message will be sent to +# this chat id (empty = do not send a notification for unconfigured roles): +DEFAULT_RECIPIENT_TELEGRAM="" + + +############################################################################### +# slack (slack.com) global notification options + +# multiple recipients can be given like this: +# "CHANNEL1 CHANNEL2 ..." + +# enable/disable sending slack notifications +SEND_SLACK="YES" + +# Login to slack.com and create an incoming webhook. You need only one for all +# your netdata servers (or you can have one for each of your netdata). +# Without it, netdata cannot send slack notifications. 
+# Get yours from: https://api.slack.com/incoming-webhooks +SLACK_WEBHOOK_URL="" + +# if a role's recipients are not configured, a notification will be sent to +# this slack channel (empty = do not send a notification for unconfigured +# roles): +DEFAULT_RECIPIENT_SLACK="" + + +############################################################################### +# RECIPIENTS PER ROLE + +# ----------------------------------------------------------------------------- +# generic system alarms +# CPU, disks, network interfaces, entropy, etc + +role_recipients_email[sysadmin]="${DEFAULT_RECIPIENT_EMAIL}" + +role_recipients_pushover[sysadmin]="${DEFAULT_RECIPIENT_PUSHOVER}" + +role_recipients_telegram[sysadmin]="${DEFAULT_RECIPIENT_TELEGRAM}" + +role_recipients_slack[sysadmin]="${DEFAULT_RECIPIENT_SLACK}" + + +# ----------------------------------------------------------------------------- +# DNS related alarms + +role_recipients_email[domainadmin]="${DEFAULT_RECIPIENT_EMAIL}" + +role_recipients_pushover[domainadmin]="${DEFAULT_RECIPIENT_PUSHOVER}" + +role_recipients_telegram[domainadmin]="${DEFAULT_RECIPIENT_TELEGRAM}" + +role_recipients_slack[domainadmin]="${DEFAULT_RECIPIENT_SLACK}" + + +# ----------------------------------------------------------------------------- +# database servers alarms +# mysql, redis, memcached, etc + +role_recipients_email[dba]="${DEFAULT_RECIPIENT_EMAIL}" + +role_recipients_pushover[dba]="${DEFAULT_RECIPIENT_PUSHOVER}" + +role_recipients_telegram[dba]="${DEFAULT_RECIPIENT_TELEGRAM}" + +role_recipients_slack[dba]="${DEFAULT_RECIPIENT_SLACK}" + + +# ----------------------------------------------------------------------------- +# web servers alarms +# apache, nginx, etc + +role_recipients_email[webmaster]="${DEFAULT_RECIPIENT_EMAIL}" + +role_recipients_pushover[webmaster]="${DEFAULT_RECIPIENT_PUSHOVER}" + +role_recipients_telegram[webmaster]="${DEFAULT_RECIPIENT_TELEGRAM}" + +role_recipients_slack[webmaster]="${DEFAULT_RECIPIENT_SLACK}" + + +# 
----------------------------------------------------------------------------- +# proxy servers alarms +# apache, nginx, etc + +role_recipients_email[proxyadmin]="${DEFAULT_RECIPIENT_EMAIL}" + +role_recipients_pushover[proxyadmin]="${DEFAULT_RECIPIENT_PUSHOVER}" + +role_recipients_telegram[proxyadmin]="${DEFAULT_RECIPIENT_TELEGRAM}" + +role_recipients_slack[proxyadmin]="${DEFAULT_RECIPIENT_SLACK}" + diff --git a/conf.d/health_email_recipients.conf b/conf.d/health_email_recipients.conf new file mode 100644 index 00000000..f56c6c64 --- /dev/null +++ b/conf.d/health_email_recipients.conf @@ -0,0 +1,2 @@ +# OBSOLETE FILE +# REPLACED WITH health_alarm_notify.conf diff --git a/conf.d/python.d/mysql.conf b/conf.d/python.d/mysql.conf index d247b89a..8fbbe651 100644 --- a/conf.d/python.d/mysql.conf +++ b/conf.d/python.d/mysql.conf @@ -104,6 +104,8 @@ tcp: # pass : '' host : 'localhost' port : '3306' + # keep in mind port might be ignored by mysql, if host = 'localhost' + # http://serverfault.com/questions/337818/how-to-force-mysql-to-connect-by-tcp-instead-of-a-unix-socket/337844#337844 tcpipv4: name : 'local' @@ -158,6 +160,8 @@ tcp_root: # pass : '' host : 'localhost' port : '3306' + # keep in mind port might be ignored by mysql, if host = 'localhost' + # http://serverfault.com/questions/337818/how-to-force-mysql-to-connect-by-tcp-instead-of-a-unix-socket/337844#337844 tcpipv4_root: name : 'local' diff --git a/conf.d/python.d/nginx.conf b/conf.d/python.d/nginx.conf index 1a27d67c..645925a5 100644 --- a/conf.d/python.d/nginx.conf +++ b/conf.d/python.d/nginx.conf @@ -1,5 +1,17 @@ # netdata python.d.plugin configuration for nginx # +# You must have ngx_http_stub_status_module configured on your nginx server for this +# plugin to work. The following is an example config. +# It must be located inside a server { } block. +# +# location /stub_status { +# stub_status; +# # Security: Only allow access from the IP below. 
+# allow 192.168.1.200; +# # Deny anyone else +# deny all; +# } +# # This file is in YaML format. Generally the format is: # # name: value @@ -47,14 +59,15 @@ # predefined parameters. These are: # # job_name: -# name: myname # the JOB's name as it will appear at the -# # dashboard (by default is the job_name) +# name: my_name # the JOB's name as it will appear at the +# # dashboard. If name: is not supplied the +# # job_name: will be used (use _ for spaces) # # JOBs sharing a name are mutually exclusive # update_every: 1 # the JOB's data collection frequency # priority: 60000 # the JOB's order on the dashboard # retries: 5 # the JOB's number of restoration attempts # -# Additionally to the above, nginx also supports the following: +# Additionally to the above, this plugin also supports the following: # # url: 'URL' # the URL to fetch nginx's status stats # @@ -63,6 +76,14 @@ # user: 'username' # pass: 'password' # +# Example +# +# RemoteNginx: +# name : 'Reverse_Proxy' +# url : 'http://yourdomain.com/stub_status' +# +# "RemoteNginx" will show up in Netdata logs. "Reverse Proxy" will show up in the menu +# in the nginx section. 
# ---------------------------------------------------------------------- # AUTO-DETECTION JOBS diff --git a/conf.d/python.d/phpfpm.conf b/conf.d/python.d/phpfpm.conf index 06d2367a..f5d067cc 100644 --- a/conf.d/python.d/phpfpm.conf +++ b/conf.d/python.d/phpfpm.conf @@ -57,6 +57,7 @@ # Additionally to the above, PHP-FPM also supports the following: # # url: 'URL' # the URL to fetch nginx's status stats +# # Be sure to include ?full&json at the end of the url # # if the URL is password protected, the following are supported: # @@ -70,13 +71,13 @@ localhost: name : 'local' - url : "http://localhost/status" + url : "http://localhost/status?full&json" localipv4: name : 'local' - url : "http://127.0.0.1/status" + url : "http://127.0.0.1/status?full&json" localipv6: name : 'local' - url : "http://::1/status" + url : "http://::1/status?full&json" diff --git a/conf.d/python.d/retroshare.conf b/conf.d/python.d/retroshare.conf new file mode 100644 index 00000000..79614373 --- /dev/null +++ b/conf.d/python.d/retroshare.conf @@ -0,0 +1,67 @@ +# netdata python.d.plugin configuration for RetroShare +# +# This file is in YAML format. Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. +# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. +# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. 
+# priority: 60000 + +# retries sets the number of retries to be made in case of failures. +# If unset, the default for python.d.plugin is used. +# Attempts to restore the service are made once every update_every +# and only if the module has collected values in the past. +# retries: 5 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# retries: 5 # the JOB's number of restoration attempts +# +# Additionally to the above, RetroShare also supports the following: +# +# - url: 'url' # the URL to the WebUI +# +# ---------------------------------------------------------------------- +# AUTO-DETECTION JOBS +# only one of them will run (they have the same name) + +localhost: + name: 'local' + url: 'http://localhost:9090' |