# Ensure that Riak is running: alert when riak.kv.throughput stops receiving
# fresh samples. $this is the age of the last collected sample in seconds.
# Once the alarm is raised the threshold drops to a single $update_every
# (hysteresis), so it only clears when $this falls back to that level.
template: riak_last_collected_secs
      on: riak.kv.throughput
    calc: $now - $last_collected_t
   every: 10s
   units: seconds ago
    warn: $this > (($status >= $WARNING) ? ($update_every) : (5 * $update_every))
    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
   delay: down 5m multiplier 1.5 max 1h
      to: dba
    info: number of seconds since the last successful data collection

# List keys detector: raise a warning while at least one list keys state
# machine is reported as active. There is no critical level for this alarm.
template: riak_list_keys_active
      on: riak.core.fsm_active
    calc: $list_fsm_active
   every: 10s
   units: state machines
    warn: $this > 0
      to: dba
    info: number of currently running list keys finite state machines


## Timing healthchecks
# KV GET
# Baseline: one-hour average of the GET latency mean. It has no warn/crit of
# its own; riak_kv_get_slow compares against it as $1h_kv_get_mean_latency.
#
# The lookup selects the dimension itself and no calc is given on purpose:
# a calc is evaluated after the lookup and its result replaces the looked-up
# value, so `calc: $node_get_fsm_time_mean` would reduce this baseline to the
# most recent sample instead of the hourly average.
template: 1h_kv_get_mean_latency
      on: riak.kv.latency.get
  lookup: average -1h unaligned of node_get_fsm_time_mean
   every: 30s
   units: ms
    info: mean average KV GET latency over the last hour

# Slow GET detector: the 3-minute average of the `mean` dimension is compared
# with the hourly baseline alarm ($1h_kv_get_mean_latency); warn at more than
# 2x the baseline, crit at more than 3x.
#
# No calc is given on purpose: a calc is evaluated after the lookup and its
# result replaces the looked-up value, so `calc: $mean` would compare the most
# recent sample rather than the 3-minute average described in `info`.
template: riak_kv_get_slow
      on: riak.kv.latency.get
  lookup: average -3m unaligned of mean
   units: ms
   every: 10s
    warn: ($this > ($1h_kv_get_mean_latency * 2) )
    crit: ($this > ($1h_kv_get_mean_latency * 3) )
    info: average KV GET time over the last 3 minutes, compared to the average over the last hour
   delay: down 5m multiplier 1.5 max 1h
      to: dba

# KV PUT
# Baseline: one-hour average of the PUT latency mean. It has no warn/crit of
# its own; riak_kv_put_slow compares against it as $1h_kv_put_mean_latency.
#
# The lookup selects the dimension itself and no calc is given on purpose:
# a calc is evaluated after the lookup and its result replaces the looked-up
# value, so `calc: $node_put_fsm_time_mean` would reduce this baseline to the
# most recent sample instead of the hourly average.
template: 1h_kv_put_mean_latency
      on: riak.kv.latency.put
  lookup: average -1h unaligned of node_put_fsm_time_mean
   every: 30s
   units: ms
    info: mean average KV PUT latency over the last hour

# Slow PUT detector: the 3-minute average of the `mean` dimension is compared
# with the hourly baseline alarm ($1h_kv_put_mean_latency); warn at more than
# 2x the baseline, crit at more than 3x.
#
# No calc is given on purpose: a calc is evaluated after the lookup and its
# result replaces the looked-up value, so `calc: $mean` would compare the most
# recent sample rather than the 3-minute average described in `info`.
template: riak_kv_put_slow
      on: riak.kv.latency.put
  lookup: average -3m unaligned of mean
   units: ms
   every: 10s
    warn: ($this > ($1h_kv_put_mean_latency * 2) )
    crit: ($this > ($1h_kv_put_mean_latency * 3) )
    info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
   delay: down 5m multiplier 1.5 max 1h
      to: dba


## VM healthchecks

# Erlang VM process count. The default Erlang VM process limit is 262144.
# On systems observed the count stays below 2000, but it may grow depending
# on load; warn above 10000 and go critical above 100000.
template: riak_vm_high_process_count
      on: riak.vm
    calc: $sys_process_count
   every: 10s
   units: processes
    warn: $this > 10000
    crit: $this > 100000
      to: dba
    info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144)