diff options
Diffstat (limited to 'health/health.d/riakkv.conf')
-rw-r--r-- | health/health.d/riakkv.conf | 38 |
1 file changed, 22 insertions, 16 deletions
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index 745302778..d63460264 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,5 +1,5 @@ # Ensure that Riak is running. template: riak_last_collected_secs -template: riak_last_collected_secs +template: riakkv_last_collected_secs on: riak.kv.throughput calc: $now - $last_collected_t units: seconds ago @@ -11,7 +11,7 @@ template: riak_last_collected_secs to: dba # Warn if a list keys operation is running. -template: riak_list_keys_active +template: riakkv_list_keys_active on: riak.core.fsm_active calc: $list_fsm_active units: state machines @@ -23,44 +23,50 @@ template: riak_list_keys_active ## Timing healthchecks # KV GET -template: 1h_kv_get_mean_latency +template: riakkv_1h_kv_get_mean_latency on: riak.kv.latency.get calc: $node_get_fsm_time_mean lookup: average -1h unaligned of time every: 30s units: ms - info: mean average KV GET latency over the last hour + info: average time between reception of client GET request and \ + subsequent response to client over the last hour -template: riak_kv_get_slow +template: riakkv_kv_get_slow on: riak.kv.latency.get calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_get_mean_latency * 2) ) - crit: ($this > ($1h_kv_get_mean_latency * 3) ) - info: average KV GET time over the last 3 minutes, compared to the average over the last hour + warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) + info: average time between reception of client GET request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba # KV PUT -template: 1h_kv_put_mean_latency +template: riakkv_1h_kv_put_mean_latency on: riak.kv.latency.put calc: $node_put_fsm_time_mean lookup: average -1h unaligned of time every: 30s units: ms - info: mean average KV PUT 
latency over the last hour + info: average time between reception of client PUT request and \ + subsequent response to the client over the last hour -template: riak_kv_put_slow +template: riakkv_kv_put_slow on: riak.kv.latency.put calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_put_mean_latency * 2) ) - crit: ($this > ($1h_kv_put_mean_latency * 3) ) - info: average KV PUT time over the last 3 minutes, compared to the average over the last hour + warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) + info: average time between reception of client PUT request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba @@ -69,12 +75,12 @@ template: riak_kv_put_slow # Default Erlang VM process limit: 262144 # On systems observed, this is < 2000, but may grow depending on load. -template: riak_vm_high_process_count +template: riakkv_vm_high_process_count on: riak.vm calc: $sys_process_count units: processes every: 10s warn: $this > 10000 crit: $this > 100000 - info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144) + info: number of processes running in the Erlang VM to: dba |