# make sure hdfs is running

# Data-collection staleness check attached to the hdfs.heap_memory chart.
# The evaluated value is `$now - $last_collected_t`, reported as "seconds ago"
# and recalculated every 10s.
# Both thresholds use hysteresis keyed on the alarm's current $status:
#  - warn: triggers when the value exceeds 5 * $update_every; while the alarm
#    is already WARNING or higher, the threshold drops to 1 * $update_every.
#  - crit: triggers when the value exceeds 60 * $update_every; while the alarm
#    is already CRITICAL, the threshold drops to 1 * $update_every.
# NOTE(review): `delay: down 5m multiplier 1.5 max 1h` presumably postpones
# recovery notifications (5m base, growing 1.5x, capped at 1h) - confirm
# against the Netdata health configuration reference.
# Notifications are addressed to `webmaster`.
template: hdfs_last_collected_secs
      on: hdfs.heap_memory
    calc: $now - $last_collected_t
   units: seconds ago
   every: 10s
    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
   delay: down 5m multiplier 1.5 max 1h
    info: number of seconds since the last successful data collection
      to: webmaster


# Common

# Used-capacity percentage attached to the hdfs.capacity chart.
# The evaluated value is `$used * 100 / ($used + $remaining)`, recalculated
# every 10s.
# Both thresholds use hysteresis keyed on the alarm's current $status:
#  - warn: triggers above 80%; while the alarm is already WARNING or higher,
#    it stays raised until the value is no longer above 70%.
#  - crit: triggers above 98%; while the alarm is already CRITICAL, it stays
#    raised until the value is no longer above 80%.
# NOTE(review): the calc has no explicit guard for `$used + $remaining` being
# zero - confirm how the expression evaluator treats that case.
# Notifications are addressed to `sysadmin`.
template: hdfs_capacity_usage
      on: hdfs.capacity
    calc: ($used) * 100 / ($used + $remaining)
   units: %
   every: 10s
    warn: $this > (($status >= $WARNING)  ? (70) : (80))
    crit: $this > (($status == $CRITICAL) ? (80) : (98))
   delay: down 15m multiplier 1.5 max 1h
    info: used capacity
      to: sysadmin


# NameNode

# Missing-block check attached to the hdfs.blocks chart.
# The evaluated value is the chart's `$missing` variable, recalculated every
# 10s. WARNING is raised whenever the value is above zero; no critical
# threshold is defined for this alarm.
# Notifications are addressed to `sysadmin`.
template: hdfs_missing_blocks
      on: hdfs.blocks
    calc: $missing
   units: missing blocks
   every: 10s
    warn: $this > 0
   delay: down 15m multiplier 1.5 max 1h
    info: missing blocks
      to: sysadmin


# Stale-DataNode check attached to the hdfs.data_nodes chart.
# The evaluated value is the chart's `$stale` variable, recalculated every
# 10s. WARNING is raised whenever the value is above zero; no critical
# threshold is defined for this alarm.
# The unit label is "stale nodes" so it matches the `$stale` value and the
# info text (it previously read "dead nodes", copied from hdfs_dead_nodes).
# Notifications are addressed to `sysadmin`.
template: hdfs_stale_nodes
      on: hdfs.data_nodes
    calc: $stale
   units: stale nodes
   every: 10s
    warn: $this > 0
   delay: down 15m multiplier 1.5 max 1h
    info: stale data nodes
      to: sysadmin


# Dead-DataNode check attached to the hdfs.data_nodes chart.
# The evaluated value is the chart's `$dead` variable, recalculated every
# 10s. CRITICAL is raised whenever the value is above zero; no warning
# threshold is defined for this alarm.
# Notifications are addressed to `sysadmin`.
template: hdfs_dead_nodes
      on: hdfs.data_nodes
    calc: $dead
   units: dead nodes
   every: 10s
    crit: $this > 0
   delay: down 15m multiplier 1.5 max 1h
    info: dead data nodes
      to: sysadmin


# DataNode

# Failed-volume check attached to the hdfs.num_failed_volumes chart.
# The evaluated value is the chart's `$fsds_num_failed_volumes` variable,
# recalculated every 10s. WARNING is raised whenever the value is above zero;
# no critical threshold is defined for this alarm.
# Notifications are addressed to `sysadmin`.
template: hdfs_num_failed_volumes
      on: hdfs.num_failed_volumes
    calc: $fsds_num_failed_volumes
   units: failed volumes
   every: 10s
    warn: $this > 0
   delay: down 15m multiplier 1.5 max 1h
    info: failed volumes
      to: sysadmin