diff options
Diffstat (limited to 'health/health.d/httpcheck.conf')
-rw-r--r-- | health/health.d/httpcheck.conf | 205 |
1 files changed, 116 insertions, 89 deletions
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 0158f63eb..d4d6376a3 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -1,99 +1,126 @@ -template: httpcheck_last_collected_secs -families: * - on: httpcheck.status - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin + template: httpcheck_last_collected_secs + families: * + on: httpcheck.status + class: Other +component: HTTP endpoint + type: Latency + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: httpcheck_web_service_up -families: * - on: httpcheck.status - lookup: average -1m unaligned percentage of success - calc: ($this < 75) ? (0) : ($this) - every: 5s - units: up/down - info: average ratio of successful HTTP requests over the last minute (at least 75%) - to: silent + template: httpcheck_web_service_up + families: * + on: httpcheck.status + class: Web Server +component: HTTP endpoint + type: Utilization + lookup: average -1m unaligned percentage of success + calc: ($this < 75) ? (0) : ($this) + every: 5s + units: up/down + info: average ratio of successful HTTP requests over the last minute (at least 75%) + to: silent -template: httpcheck_web_service_bad_content -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of bad_content - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - info: average ratio of HTTP responses with unexpected content over the last 5 minutes - options: no-clear-notification - to: webmaster + template: httpcheck_web_service_bad_content + families: * + on: httpcheck.status + class: Web Server +component: HTTP endpoint + type: Workload + lookup: average -5m unaligned percentage of bad_content + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + info: average ratio of HTTP responses with unexpected content over the last 5 minutes + options: no-clear-notification + to: webmaster -template: httpcheck_web_service_bad_status -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of bad_status - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - info: average ratio of HTTP responses with unexpected status over the last 5 minutes - options: no-clear-notification - to: webmaster + template: httpcheck_web_service_bad_status + families: * + on: httpcheck.status + class: Web Server +component: HTTP endpoint + type: Workload + lookup: average -5m unaligned percentage of bad_status + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + info: average ratio of HTTP responses with unexpected status over the last 5 minutes + options: no-clear-notification + to: webmaster -template: httpcheck_web_service_timeouts -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of timeout - every: 10s - units: % - info: average ratio of HTTP request timeouts over the last 5 minutes + template: httpcheck_web_service_timeouts + families: * + on: httpcheck.status + class: Web Server +component: HTTP endpoint + type: Latency + lookup: average -5m unaligned percentage of timeout + every: 10s + units: % + info: average ratio of HTTP request timeouts over the last 5 minutes -template: httpcheck_no_web_service_connections -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of no_connection - every: 10s - units: % - info: average ratio of failed requests during the last 5 minutes + template: httpcheck_no_web_service_connections + families: * + on: httpcheck.status + class: Other +component: HTTP endpoint + type: Errors + lookup: average -5m unaligned percentage of no_connection + every: 10s + units: % + info: average ratio of failed requests during the last 5 minutes # combined timeout & no connection alarm -template: httpcheck_web_service_unreachable -families: * - on: httpcheck.status - calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) - units: % - every: 10s - warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40) - crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40 - delay: down 5m multiplier 1.5 max 1h - info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes - options: no-clear-notification - to: webmaster + template: httpcheck_web_service_unreachable + families: * + on: httpcheck.status + class: Web Server +component: HTTP endpoint + type: Errors + calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) + units: % + every: 10s + warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40) + crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40 + delay: down 5m multiplier 1.5 max 1h + info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes + options: no-clear-notification + to: webmaster -template: httpcheck_1h_web_service_response_time -families: * - on: httpcheck.responsetime - lookup: average -1h unaligned of time - every: 30s - units: ms - info: average HTTP response time over the last hour + template: httpcheck_1h_web_service_response_time + families: * + on: httpcheck.responsetime + class: Other +component: HTTP endpoint + type: Latency + lookup: average -1h unaligned of time + every: 30s + units: ms + info: average HTTP response time over the last hour -template: httpcheck_web_service_slow -families: * - on: httpcheck.responsetime - lookup: average -3m unaligned of time - units: ms - every: 10s - warn: ($this > ($httpcheck_1h_web_service_response_time * 2) ) - crit: ($this > ($httpcheck_1h_web_service_response_time * 3) ) - delay: down 5m multiplier 1.5 max 1h - info: average HTTP response time over the last 3 minutes, compared to the average over the last hour - options: no-clear-notification - to: webmaster + template: httpcheck_web_service_slow + families: * + on: httpcheck.responsetime + class: Web Server +component: HTTP endpoint + type: Latency + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($httpcheck_1h_web_service_response_time * 2) ) + crit: ($this > ($httpcheck_1h_web_service_response_time * 3) ) + delay: down 5m multiplier 1.5 max 1h + info: average HTTP response time over the last 3 minutes, compared to the average over the last hour + options: no-clear-notification + to: webmaster |