# Netdata health alert templates for ClickHouse.
# Source path: src/health/health.d/clickhouse.conf
# (blob e24f71830908dd12cb9bddb4bd04de75c4992d1e)

# you can disable an alarm notification by setting the 'to' line to: silent

 # Restart detection: attached to every clickhouse.uptime chart and evaluated
 # every 10 seconds. The alert value is the chart's "uptime" dimension, in
 # seconds.
 # Notifications are sent to "silent", i.e. disabled by default.
 template: clickhouse_restarted
       on: clickhouse.uptime
    class: Error
     type: Database
component: ClickHouse
     calc: $uptime
    units: seconds
    every: 10s
 # WARNING only while uptime is strictly between 1 and 180 seconds, so the
 # alert clears by itself about three minutes after a restart.
 # NOTE(review): the "> 1" lower bound presumably filters out a zero/unset
 # uptime sample - confirm against the collector.
     warn: $this > 1 AND $this < 180
  summary: ClickHouse restart detected
     info: ClickHouse has recently been restarted
       to: silent

 # Preempted queries: looks at the maximum value of
 # clickhouse.queries_preempted over the last minute (window not aligned to
 # clock boundaries) and raises WARNING when that maximum is above zero.
 # Evaluated every 10 seconds; notifications go to the "dba" recipient.
 template: clickhouse_queries_preempted
       on: clickhouse.queries_preempted
    class: Workload
     type: Database
component: ClickHouse
    lookup: max -1m unaligned
     units: preempted_queries
     every: 10s
      warn: $this > 0
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
     delay: down 5m multiplier 1.5 max 1h
   summary: ClickHouse preempted queries detected
      info: ClickHouse has queries that are stopped and waiting due to priority setting
        to: dba

 # Long-running query: tracks the maximum of
 # clickhouse.longest_running_query_time (seconds) over the last minute.
 # Evaluated every 10 seconds; notifications go to the "dba" recipient.
 template: clickhouse_long_running_query
       on: clickhouse.longest_running_query_time
    class: Latency
     type: Database
component: ClickHouse
   lookup: max -1m unaligned
    units: seconds
    every: 10s
 # Hysteresis: the alert is raised when the value exceeds 600 seconds; once
 # the status is already WARNING (or higher) the threshold drops to 300
 # seconds, so the alert only clears when the value is back at or below 300.
     warn: $this > (($status >= $WARNING)  ? (300) : (600))
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse long-running query detected
     info: ClickHouse has a long-running query exceeding the threshold
       to: dba

 # Rejected INSERTs: sums clickhouse.rejected_inserts over the last minute
 # and raises WARNING as soon as that sum is above zero, i.e. at least one
 # INSERT was rejected in the window.
 # Evaluated every 10 seconds; notifications go to the "dba" recipient.
 template: clickhouse_rejected_inserts
       on: clickhouse.rejected_inserts
    class: Workload
     type: Database
component: ClickHouse
   lookup: sum -1m unaligned
    units: rejected_inserts
    every: 10s
     warn: $this > 0
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse rejected INSERT queries detected
     info: ClickHouse has INSERT queries that are rejected due to high number of active data parts for partition in a MergeTree
       to: dba

 # Delayed (throttled) INSERTs: sums clickhouse.delayed_inserts over the last
 # minute and raises WARNING when that sum is above zero.
 # Evaluated every 10 seconds. Notifications are sent to "silent", i.e.
 # disabled by default.
 template: clickhouse_delayed_inserts
       on: clickhouse.delayed_inserts
    class: Workload
     type: Database
component: ClickHouse
   lookup: sum -1m unaligned
    units: delayed_inserts
    every: 10s
     warn: $this > 0
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse delayed INSERT queries detected
     info: ClickHouse has INSERT queries that are throttled due to high number of active data parts for partition in a MergeTree
       to: silent

 # Replication lag: tracks the one-minute average of
 # clickhouse.replicas_max_absolute_delay (seconds).
 # Evaluated every 10 seconds; notifications go to the "dba" recipient.
 template: clickhouse_replication_lag
       on: clickhouse.replicas_max_absolute_delay
    class: Workload
     type: Database
component: ClickHouse
   lookup: avg -1m unaligned
    units: seconds
    every: 10s
 # Hysteresis: raised when the average exceeds 300 seconds (the "5 minutes"
 # quoted in the info text); once the status is WARNING (or higher) it stays
 # raised until the average is back at or below 250 seconds.
     warn: $this > (($status >= $WARNING)  ? (250) : (300))
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse high replication lag detected
     info: ClickHouse is experiencing replication lag greater than 5 minutes
       to: dba

 # Read-only replicated tables: looks at the maximum of
 # clickhouse.replicated_readonly_tables over the last minute and raises
 # WARNING when it is above zero, i.e. at least one table was read-only at
 # some point in the window.
 # Evaluated every 10 seconds; notifications go to the "dba" recipient.
 template: clickhouse_replicated_readonly_tables
       on: clickhouse.replicated_readonly_tables
    class: Error
     type: Database
component: ClickHouse
   lookup: max -1m unaligned
    units: readonly_tables
    every: 10s
     warn: $this > 0
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse replicated tables in readonly state detected
     info: ClickHouse has replicated tables in readonly state due to ZooKeeper session loss/startup without ZooKeeper configured
       to: dba

 # Parts per partition: tracks the one-minute average of
 # clickhouse.max_part_count_for_partition.
 # Evaluated every 10 seconds; notifications go to the "dba" recipient.
 template: clickhouse_max_part_count_for_partition
       on: clickhouse.max_part_count_for_partition
    class: Workload
     type: Database
component: ClickHouse
   lookup: avg -1m unaligned
    units: parts
    every: 10s
 # Hysteresis: raised when the average exceeds 300 parts; once the status is
 # WARNING (or higher) it stays raised until the average is back at or
 # below 200 parts.
 # NOTE(review): 300 presumably mirrors a ClickHouse MergeTree insert limit
 # (parts_to_throw_insert) - confirm against the server version in use.
     warn: $this > (($status >= $WARNING)  ? (200) : (300))
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse high parts/partition detected
     info: ClickHouse high number of parts per partition
       to: dba

 # Distributed connection failures: sums
 # clickhouse.distributed_connections_fail_exhausted_retries over the last
 # minute and raises WARNING when that sum is above zero.
 # Evaluated every 10 seconds; notifications go to the "dba" recipient.
 template: clickhouse_distributed_connections_failures
       on: clickhouse.distributed_connections_fail_exhausted_retries
    class: Error
     type: Database
component: ClickHouse
   lookup: sum -1m unaligned
    units: failures
    every: 10s
     warn: $this > 0
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse distributed connections failures detected
     info: ClickHouse has failed distributed connections after exhausting all retry attempts
       to: dba

 # Pending Distributed-table files: tracks the maximum of
 # clickhouse.distributed_files_to_insert over the last minute.
 # Evaluated every 10 seconds. Notifications are sent to "silent", i.e.
 # disabled by default.
 template: clickhouse_distributed_files_to_insert
       on: clickhouse.distributed_files_to_insert
    class: Workload
     type: Database
component: ClickHouse
   lookup: max -1m unaligned
    units: files
    every: 10s
 # Hysteresis: raised when the maximum exceeds 80 files; once the status is
 # WARNING (or higher) it stays raised until the maximum is back at or
 # below 40 files.
     warn: $this > (($status >= $WARNING)  ? (40) : (80))
 # Notification delay applied when the alert moves to a lower status: starts
 # at 5 minutes, scaled by 1.5 on repeated state changes, capped at 1 hour.
    delay: down 5m multiplier 1.5 max 1h
  summary: ClickHouse high files to insert detected
     info: ClickHouse high number of pending files to process for asynchronous insertion into Distributed tables
       to: silent