diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.d/riakkv.conf | 159 |
1 files changed, 90 insertions, 69 deletions
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index d63460264..b2c0e8d9c 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,86 +1,107 @@ # Ensure that Riak is running. template: riak_last_collected_secs -template: riakkv_last_collected_secs - on: riak.kv.throughput - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba + template: riakkv_last_collected_secs + on: riak.kv.throughput + class: Database +component: Riak KV + type: Latency + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba # Warn if a list keys operation is running. -template: riakkv_list_keys_active - on: riak.core.fsm_active - calc: $list_fsm_active - units: state machines - every: 10s - warn: $list_fsm_active > 0 - info: number of currently running list keys finite state machines - to: dba + template: riakkv_list_keys_active + on: riak.core.fsm_active + class: Database +component: Riak KV + type: Utilization + calc: $list_fsm_active + units: state machines + every: 10s + warn: $list_fsm_active > 0 + info: number of currently running list keys finite state machines + to: dba ## Timing healthchecks # KV GET -template: riakkv_1h_kv_get_mean_latency - on: riak.kv.latency.get - calc: $node_get_fsm_time_mean - lookup: average -1h unaligned of time - every: 30s - units: ms - info: average time between reception of client GET request and \ - subsequent response to client over the last hour + template: riakkv_1h_kv_get_mean_latency + on: riak.kv.latency.get + class: Database +component: Riak KV + type: Latency + calc: $node_get_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: average time between reception of client GET request and \ + subsequent response to client over the last hour -template: riakkv_kv_get_slow - on: riak.kv.latency.get - calc: $mean - lookup: average -3m unaligned of time - units: ms - every: 10s - warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) - crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) - info: average time between reception of client GET request and \ - subsequent response to the client over the last 3 minutes, \ - compared to the average over the last hour - delay: down 5m multiplier 1.5 max 1h - to: dba + template: riakkv_kv_get_slow + on: riak.kv.latency.get + class: Database +component: Riak KV + type: Latency + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) + info: average time between reception of client GET request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba # KV PUT -template: riakkv_1h_kv_put_mean_latency - on: riak.kv.latency.put - calc: $node_put_fsm_time_mean - lookup: average -1h unaligned of time - every: 30s - units: ms - info: average time between reception of client PUT request and \ - subsequent response to the client over the last hour + template: riakkv_1h_kv_put_mean_latency + on: riak.kv.latency.put + class: Database +component: Riak KV + type: Latency + calc: $node_put_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: average time between reception of client PUT request and \ + subsequent response to the client over the last hour -template: riakkv_kv_put_slow - on: riak.kv.latency.put - calc: $mean - lookup: average -3m unaligned of time - units: ms - every: 10s - warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) ) - crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) - info: average time between reception of client PUT request and \ - subsequent response to the client over the last 3 minutes, \ - compared to the average over the last hour - delay: down 5m multiplier 1.5 max 1h - to: dba + template: riakkv_kv_put_slow + on: riak.kv.latency.put + class: Database +component: Riak KV + type: Latency + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) + info: average time between reception of client PUT request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba ## VM healthchecks # Default Erlang VM process limit: 262144 # On systems observed, this is < 2000, but may grow depending on load. -template: riakkv_vm_high_process_count - on: riak.vm - calc: $sys_process_count - units: processes - every: 10s - warn: $this > 10000 - crit: $this > 100000 - info: number of processes running in the Erlang VM - to: dba + template: riakkv_vm_high_process_count + on: riak.vm + class: Database +component: Riak KV + type: Utilization + calc: $sys_process_count + units: processes + every: 10s + warn: $this > 10000 + crit: $this > 100000 + info: number of processes running in the Erlang VM + to: dba |