diff options
Diffstat (limited to '')
-rw-r--r-- | src/health/README.md | 17 | ||||
-rw-r--r-- | src/health/REFERENCE.md | 1106 | ||||
-rw-r--r-- | src/health/guides/adaptec_raid/adaptec_raid_ld_status.md (renamed from health/guides/adaptec_raid/adaptec_raid_ld_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/adaptec_raid/adaptec_raid_pd_state.md (renamed from health/guides/adaptec_raid/adaptec_raid_pd_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/anomalies/anomalies_anomaly_flags.md (renamed from health/guides/anomalies/anomalies_anomaly_flags.md) | 0 | ||||
-rw-r--r-- | src/health/guides/anomalies/anomalies_anomaly_probabilities.md (renamed from health/guides/anomalies/anomalies_anomaly_probabilities.md) | 0 | ||||
-rw-r--r-- | src/health/guides/apcupsd/apcupsd_10min_ups_load.md (renamed from health/guides/apcupsd/apcupsd_10min_ups_load.md) | 0 | ||||
-rw-r--r-- | src/health/guides/apcupsd/apcupsd_last_collected_secs.md (renamed from health/guides/apcupsd/apcupsd_last_collected_secs.md) | 2 | ||||
-rw-r--r-- | src/health/guides/apcupsd/apcupsd_ups_charge.md (renamed from health/guides/apcupsd/apcupsd_ups_charge.md) | 0 | ||||
-rw-r--r-- | src/health/guides/beanstalk/beanstalk_number_of_tubes.md (renamed from health/guides/beanstalk/beanstalk_number_of_tubes.md) | 0 | ||||
-rw-r--r-- | src/health/guides/beanstalk/beanstalk_server_buried_jobs.md (renamed from health/guides/beanstalk/beanstalk_server_buried_jobs.md) | 0 | ||||
-rw-r--r-- | src/health/guides/beanstalk/beanstalk_tube_buried_jobs.md (renamed from health/guides/beanstalk/beanstalk_tube_buried_jobs.md) | 0 | ||||
-rw-r--r-- | src/health/guides/boinc/boinc_active_tasks.md (renamed from health/guides/boinc/boinc_active_tasks.md) | 0 | ||||
-rw-r--r-- | src/health/guides/boinc/boinc_compute_errors.md (renamed from health/guides/boinc/boinc_compute_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/boinc/boinc_total_tasks.md (renamed from health/guides/boinc/boinc_total_tasks.md) | 2 | ||||
-rw-r--r-- | src/health/guides/boinc/boinc_upload_errors.md (renamed from health/guides/boinc/boinc_upload_errors.md) | 2 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_allocated.md (renamed from health/guides/btrfs/btrfs_allocated.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_data.md (renamed from health/guides/btrfs/btrfs_data.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_device_corruption_errors.md (renamed from health/guides/btrfs/btrfs_device_corruption_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_device_flush_errors.md (renamed from health/guides/btrfs/btrfs_device_flush_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_device_generation_errors.md (renamed from health/guides/btrfs/btrfs_device_generation_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_device_read_errors.md (renamed from health/guides/btrfs/btrfs_device_read_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_device_write_errors.md (renamed from health/guides/btrfs/btrfs_device_write_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_metadata.md (renamed from health/guides/btrfs/btrfs_metadata.md) | 0 | ||||
-rw-r--r-- | src/health/guides/btrfs/btrfs_system.md (renamed from health/guides/btrfs/btrfs_system.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ceph/ceph_cluster_space_usage.md (renamed from health/guides/ceph/ceph_cluster_space_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cgroups/cgroup_10min_cpu_usage.md (renamed from health/guides/cgroups/cgroup_10min_cpu_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cgroups/cgroup_ram_in_use.md (renamed from health/guides/cgroups/cgroup_ram_in_use.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cgroups/k8s_cgroup_10min_cpu_usage.md (renamed from health/guides/cgroups/k8s_cgroup_10min_cpu_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cgroups/k8s_cgroup_ram_in_use.md (renamed from health/guides/cgroups/k8s_cgroup_ram_in_use.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cockroachdb/cockroachdb_open_file_descriptors_limit.md (renamed from health/guides/cockroachdb/cockroachdb_open_file_descriptors_limit.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md (renamed from health/guides/cockroachdb/cockroachdb_unavailable_ranges.md) | 2 | ||||
-rw-r--r-- | src/health/guides/cockroachdb/cockroachdb_underreplicated_ranges.md (renamed from health/guides/cockroachdb/cockroachdb_underreplicated_ranges.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cockroachdb/cockroachdb_used_storage_capacity.md (renamed from health/guides/cockroachdb/cockroachdb_used_storage_capacity.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cockroachdb/cockroachdb_used_usable_storage_capacity.md (renamed from health/guides/cockroachdb/cockroachdb_used_usable_storage_capacity.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_autopilot_health_status.md (renamed from health/guides/consul/consul_autopilot_health_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_autopilot_server_health_status.md (renamed from health/guides/consul/consul_autopilot_server_health_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_client_rpc_requests_exceeded.md (renamed from health/guides/consul/consul_client_rpc_requests_exceeded.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_client_rpc_requests_failed.md (renamed from health/guides/consul/consul_client_rpc_requests_failed.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_gc_pause_time.md (renamed from health/guides/consul/consul_gc_pause_time.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_license_expiration_time.md (renamed from health/guides/consul/consul_license_expiration_time.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_node_health_check_status.md (renamed from health/guides/consul/consul_node_health_check_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_raft_leader_last_contact_time.md (renamed from health/guides/consul/consul_raft_leader_last_contact_time.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_raft_leadership_transitions.md (renamed from health/guides/consul/consul_raft_leadership_transitions.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_raft_thread_fsm_saturation.md (renamed from health/guides/consul/consul_raft_thread_fsm_saturation.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_raft_thread_main_saturation.md (renamed from health/guides/consul/consul_raft_thread_main_saturation.md) | 0 | ||||
-rw-r--r-- | src/health/guides/consul/consul_service_health_check_status.md (renamed from health/guides/consul/consul_service_health_check_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cpu/10min_cpu_iowait.md (renamed from health/guides/cpu/10min_cpu_iowait.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cpu/10min_cpu_usage.md (renamed from health/guides/cpu/10min_cpu_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/cpu/20min_steal_cpu.md (renamed from health/guides/cpu/20min_steal_cpu.md) | 0 | ||||
-rw-r--r-- | src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md (renamed from health/guides/dbengine/10min_dbengine_global_flushing_errors.md) | 2 | ||||
-rw-r--r-- | src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md (renamed from health/guides/dbengine/10min_dbengine_global_flushing_warnings.md) | 2 | ||||
-rw-r--r-- | src/health/guides/dbengine/10min_dbengine_global_fs_errors.md (renamed from health/guides/dbengine/10min_dbengine_global_fs_errors.md) | 2 | ||||
-rw-r--r-- | src/health/guides/dbengine/10min_dbengine_global_io_errors.md (renamed from health/guides/dbengine/10min_dbengine_global_io_errors.md) | 2 | ||||
-rw-r--r-- | src/health/guides/disks/10min_disk_backlog.md (renamed from health/guides/disks/10min_disk_backlog.md) | 0 | ||||
-rw-r--r-- | src/health/guides/disks/10min_disk_utilization.md (renamed from health/guides/disks/10min_disk_utilization.md) | 0 | ||||
-rw-r--r-- | src/health/guides/disks/bcache_cache_dirty.md (renamed from health/guides/disks/bcache_cache_dirty.md) | 0 | ||||
-rw-r--r-- | src/health/guides/disks/bcache_cache_errors.md (renamed from health/guides/disks/bcache_cache_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/disks/disk_inode_usage.md (renamed from health/guides/disks/disk_inode_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/disks/disk_space_usage.md (renamed from health/guides/disks/disk_space_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/dns_query/dns_query_query_status.md (renamed from health/guides/dns_query/dns_query_query_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/dnsmasq/dnsmasq_dhcp_dhcp_range_utilization.md (renamed from health/guides/dnsmasq/dnsmasq_dhcp_dhcp_range_utilization.md) | 0 | ||||
-rw-r--r-- | src/health/guides/docker/docker_container_unhealthy.md (renamed from health/guides/docker/docker_container_unhealthy.md) | 0 | ||||
-rw-r--r-- | src/health/guides/elasticsearch/elasticsearch_cluster_health_status_red.md (renamed from health/guides/elasticsearch/elasticsearch_cluster_health_status_red.md) | 0 | ||||
-rw-r--r-- | src/health/guides/elasticsearch/elasticsearch_cluster_health_status_yellow.md (renamed from health/guides/elasticsearch/elasticsearch_cluster_health_status_yellow.md) | 0 | ||||
-rw-r--r-- | src/health/guides/elasticsearch/elasticsearch_node_index_health_red.md (renamed from health/guides/elasticsearch/elasticsearch_node_index_health_red.md) | 0 | ||||
-rw-r--r-- | src/health/guides/elasticsearch/elasticsearch_node_indices_search_time_fetch.md (renamed from health/guides/elasticsearch/elasticsearch_node_indices_search_time_fetch.md) | 0 | ||||
-rw-r--r-- | src/health/guides/elasticsearch/elasticsearch_node_indices_search_time_query.md (renamed from health/guides/elasticsearch/elasticsearch_node_indices_search_time_query.md) | 0 | ||||
-rw-r--r-- | src/health/guides/entropy/lowest_entropy.md (renamed from health/guides/entropy/lowest_entropy.md) | 4 | ||||
-rw-r--r-- | src/health/guides/exporting/exporting_last_buffering.md (renamed from health/guides/exporting/exporting_last_buffering.md) | 2 | ||||
-rw-r--r-- | src/health/guides/exporting/exporting_metrics_sent.md (renamed from health/guides/exporting/exporting_metrics_sent.md) | 2 | ||||
-rw-r--r-- | src/health/guides/gearman/gearman_workers_queued.md (renamed from health/guides/gearman/gearman_workers_queued.md) | 0 | ||||
-rw-r--r-- | src/health/guides/geth/geth_chainhead_diff_between_header_block.md (renamed from health/guides/geth/geth_chainhead_diff_between_header_block.md) | 0 | ||||
-rw-r--r-- | src/health/guides/haproxy/haproxy_backend_server_status.md (renamed from health/guides/haproxy/haproxy_backend_server_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/haproxy/haproxy_backend_status.md (renamed from health/guides/haproxy/haproxy_backend_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/hdfs/hdfs_capacity_usage.md (renamed from health/guides/hdfs/hdfs_capacity_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/hdfs/hdfs_dead_nodes.md (renamed from health/guides/hdfs/hdfs_dead_nodes.md) | 0 | ||||
-rw-r--r-- | src/health/guides/hdfs/hdfs_missing_blocks.md (renamed from health/guides/hdfs/hdfs_missing_blocks.md) | 0 | ||||
-rw-r--r-- | src/health/guides/hdfs/hdfs_num_failed_volumes.md (renamed from health/guides/hdfs/hdfs_num_failed_volumes.md) | 0 | ||||
-rw-r--r-- | src/health/guides/hdfs/hdfs_stale_nodes.md (renamed from health/guides/hdfs/hdfs_stale_nodes.md) | 0 | ||||
-rw-r--r-- | src/health/guides/httpcheck/httpcheck_web_service_bad_content.md (renamed from health/guides/httpcheck/httpcheck_web_service_bad_content.md) | 2 | ||||
-rw-r--r-- | src/health/guides/httpcheck/httpcheck_web_service_bad_status.md (renamed from health/guides/httpcheck/httpcheck_web_service_bad_status.md) | 2 | ||||
-rw-r--r-- | src/health/guides/httpcheck/httpcheck_web_service_no_connection.md (renamed from health/guides/httpcheck/httpcheck_web_service_no_connection.md) | 0 | ||||
-rw-r--r-- | src/health/guides/httpcheck/httpcheck_web_service_slow.md (renamed from health/guides/httpcheck/httpcheck_web_service_slow.md) | 2 | ||||
-rw-r--r-- | src/health/guides/httpcheck/httpcheck_web_service_timeouts.md (renamed from health/guides/httpcheck/httpcheck_web_service_timeouts.md) | 3 | ||||
-rw-r--r-- | src/health/guides/httpcheck/httpcheck_web_service_unreachable.md (renamed from health/guides/httpcheck/httpcheck_web_service_unreachable.md) | 2 | ||||
-rw-r--r-- | src/health/guides/httpcheck/httpcheck_web_service_up.md (renamed from health/guides/httpcheck/httpcheck_web_service_up.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ioping/ioping_disk_latency.md (renamed from health/guides/ioping/ioping_disk_latency.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ipc/semaphore_arrays_used.md (renamed from health/guides/ipc/semaphore_arrays_used.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ipc/semaphores_used.md (renamed from health/guides/ipc/semaphores_used.md) | 3 | ||||
-rw-r--r-- | src/health/guides/ipfs/ipfs_datastore_usage.md (renamed from health/guides/ipfs/ipfs_datastore_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ipmi/ipmi_events.md (renamed from health/guides/ipmi/ipmi_events.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ipmi/ipmi_sensors_states.md (renamed from health/guides/ipmi/ipmi_sensors_states.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_05.md (renamed from health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_05.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_09.md (renamed from health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_09.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_099.md (renamed from health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_099.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_05.md (renamed from health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_05.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_09.md (renamed from health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_09.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_099.md (renamed from health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_099.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_node_config_error.md (renamed from health/guides/kubelet/kubelet_node_config_error.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_operations_error.md (renamed from health/guides/kubelet/kubelet_operations_error.md) | 0 | ||||
-rw-r--r-- | src/health/guides/kubelet/kubelet_token_requests.md (renamed from health/guides/kubelet/kubelet_token_requests.md) | 0 | ||||
-rw-r--r-- | src/health/guides/linux_power_supply/linux_power_supply_capacity.md (renamed from health/guides/linux_power_supply/linux_power_supply_capacity.md) | 0 | ||||
-rw-r--r-- | src/health/guides/load/load_average_1.md (renamed from health/guides/load/load_average_1.md) | 0 | ||||
-rw-r--r-- | src/health/guides/load/load_average_15.md (renamed from health/guides/load/load_average_15.md) | 2 | ||||
-rw-r--r-- | src/health/guides/load/load_average_5.md (renamed from health/guides/load/load_average_5.md) | 2 | ||||
-rw-r--r-- | src/health/guides/load/load_cpu_number.md (renamed from health/guides/load/load_cpu_number.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mdstat/mdstat_disks.md (renamed from health/guides/mdstat/mdstat_disks.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mdstat/mdstat_mismatch_cnt.md (renamed from health/guides/mdstat/mdstat_mismatch_cnt.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mdstat/mdstat_nonredundant_last_collected.md (renamed from health/guides/mdstat/mdstat_nonredundant_last_collected.md) | 2 | ||||
-rw-r--r-- | src/health/guides/megacli/megacli_adapter_state.md (renamed from health/guides/megacli/megacli_adapter_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/megacli/megacli_bbu_cycle_count.md (renamed from health/guides/megacli/megacli_bbu_cycle_count.md) | 0 | ||||
-rw-r--r-- | src/health/guides/megacli/megacli_bbu_relative_charge.md (renamed from health/guides/megacli/megacli_bbu_relative_charge.md) | 0 | ||||
-rw-r--r-- | src/health/guides/megacli/megacli_pd_media_errors.md (renamed from health/guides/megacli/megacli_pd_media_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/megacli/megacli_pd_predictive_failures.md (renamed from health/guides/megacli/megacli_pd_predictive_failures.md) | 0 | ||||
-rw-r--r-- | src/health/guides/memcached/memcached_cache_fill_rate.md (renamed from health/guides/memcached/memcached_cache_fill_rate.md) | 0 | ||||
-rw-r--r-- | src/health/guides/memcached/memcached_cache_memory_usage.md (renamed from health/guides/memcached/memcached_cache_memory_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/memcached/memcached_out_of_cache_space_time.md (renamed from health/guides/memcached/memcached_out_of_cache_space_time.md) | 0 | ||||
-rw-r--r-- | src/health/guides/memory/1hour_ecc_memory_correctable.md (renamed from health/guides/memory/1hour_ecc_memory_correctable.md) | 0 | ||||
-rw-r--r-- | src/health/guides/memory/1hour_ecc_memory_uncorrectable.md (renamed from health/guides/memory/1hour_ecc_memory_uncorrectable.md) | 0 | ||||
-rw-r--r-- | src/health/guides/memory/1hour_memory_hw_corrupted.md (renamed from health/guides/memory/1hour_memory_hw_corrupted.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ml/ml_1min_node_ar.md | 26 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_10s_slow_queries.md (renamed from health/guides/mysql/mysql_10s_slow_queries.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_10s_table_locks_immediate.md (renamed from health/guides/mysql/mysql_10s_table_locks_immediate.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_10s_table_locks_waited.md (renamed from health/guides/mysql/mysql_10s_table_locks_waited.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_10s_waited_locks_ratio.md (renamed from health/guides/mysql/mysql_10s_waited_locks_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_connections.md (renamed from health/guides/mysql/mysql_connections.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_galera_cluster_size.md (renamed from health/guides/mysql/mysql_galera_cluster_size.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_galera_cluster_size_max_2m.md (renamed from health/guides/mysql/mysql_galera_cluster_size_max_2m.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_galera_cluster_state_crit.md (renamed from health/guides/mysql/mysql_galera_cluster_state_crit.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_galera_cluster_state_warn.md (renamed from health/guides/mysql/mysql_galera_cluster_state_warn.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_galera_cluster_status.md (renamed from health/guides/mysql/mysql_galera_cluster_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_replication.md (renamed from health/guides/mysql/mysql_replication.md) | 0 | ||||
-rw-r--r-- | src/health/guides/mysql/mysql_replication_lag.md (renamed from health/guides/mysql/mysql_replication_lag.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/10min_fifo_errors.md (renamed from health/guides/net/10min_fifo_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/10min_netisr_backlog_exceeded.md (renamed from health/guides/net/10min_netisr_backlog_exceeded.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/10s_received_packets_storm.md (renamed from health/guides/net/10s_received_packets_storm.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/1m_received_packets_rate.md (renamed from health/guides/net/1m_received_packets_rate.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/1m_received_traffic_overflow.md (renamed from health/guides/net/1m_received_traffic_overflow.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/1m_sent_traffic_overflow.md (renamed from health/guides/net/1m_sent_traffic_overflow.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/inbound_packets_dropped.md (renamed from health/guides/net/inbound_packets_dropped.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/inbound_packets_dropped_ratio.md (renamed from health/guides/net/inbound_packets_dropped_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/interface_inbound_errors.md (renamed from health/guides/net/interface_inbound_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/interface_outbound_errors.md (renamed from health/guides/net/interface_outbound_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/interface_speed.md (renamed from health/guides/net/interface_speed.md) | 0 | ||||
-rw-r--r-- | src/health/guides/net/outbound_packets_dropped.md (renamed from health/guides/net/outbound_packets_dropped.md) | 4 | ||||
-rw-r--r-- | src/health/guides/net/outbound_packets_dropped_ratio.md (renamed from health/guides/net/outbound_packets_dropped_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/netdev/1min_netdev_backlog_exceeded.md (renamed from health/guides/netdev/1min_netdev_backlog_exceeded.md) | 0 | ||||
-rw-r--r-- | src/health/guides/netdev/1min_netdev_budget_ran_outs.md (renamed from health/guides/netdev/1min_netdev_budget_ran_outs.md) | 0 | ||||
-rw-r--r-- | src/health/guides/netfilter/netfilter_conntrack_full.md (renamed from health/guides/netfilter/netfilter_conntrack_full.md) | 0 | ||||
-rw-r--r-- | src/health/guides/nvme/nvme_device_critical_warnings_state.md (renamed from health/guides/nvme/nvme_device_critical_warnings_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/pihole/pihole_blocklist_last_update.md (renamed from health/guides/pihole/pihole_blocklist_last_update.md) | 0 | ||||
-rw-r--r-- | src/health/guides/pihole/pihole_status.md (renamed from health/guides/pihole/pihole_status.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ping/ping_host_latency.md (renamed from health/guides/ping/ping_host_latency.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ping/ping_host_reachable.md (renamed from health/guides/ping/ping_host_reachable.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ping/ping_packet_loss.md (renamed from health/guides/ping/ping_packet_loss.md) | 0 | ||||
-rw-r--r-- | src/health/guides/portcheck/portcheck_connection_fails.md (renamed from health/guides/portcheck/portcheck_connection_fails.md) | 0 | ||||
-rw-r--r-- | src/health/guides/portcheck/portcheck_connection_timeouts.md (renamed from health/guides/portcheck/portcheck_connection_timeouts.md) | 3 | ||||
-rw-r--r-- | src/health/guides/portcheck/portcheck_service_reachable.md (renamed from health/guides/portcheck/portcheck_service_reachable.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_acquired_locks_utilization.md (renamed from health/guides/postgres/postgres_acquired_locks_utilization.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_db_cache_io_ratio.md (renamed from health/guides/postgres/postgres_db_cache_io_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_db_deadlocks_rate.md (renamed from health/guides/postgres/postgres_db_deadlocks_rate.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_db_transactions_rollback_ratio.md (renamed from health/guides/postgres/postgres_db_transactions_rollback_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_index_bloat_size_perc.md (renamed from health/guides/postgres/postgres_index_bloat_size_perc.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_table_bloat_size_perc.md (renamed from health/guides/postgres/postgres_table_bloat_size_perc.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_table_cache_io_ratio.md (renamed from health/guides/postgres/postgres_table_cache_io_ratio.md) | 3 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_table_index_cache_io_ratio.md (renamed from health/guides/postgres/postgres_table_index_cache_io_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_table_last_autoanalyze_time.md (renamed from health/guides/postgres/postgres_table_last_autoanalyze_time.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_table_last_autovacuum_time.md (renamed from health/guides/postgres/postgres_table_last_autovacuum_time.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_table_toast_cache_io_ratio.md (renamed from health/guides/postgres/postgres_table_toast_cache_io_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_table_toast_index_cache_io_ratio.md (renamed from health/guides/postgres/postgres_table_toast_index_cache_io_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_total_connection_utilization.md (renamed from health/guides/postgres/postgres_total_connection_utilization.md) | 0 | ||||
-rw-r--r-- | src/health/guides/postgres/postgres_txid_exhaustion_perc.md (renamed from health/guides/postgres/postgres_txid_exhaustion_perc.md) | 0 | ||||
-rw-r--r-- | src/health/guides/processes/active_processes.md (renamed from health/guides/processes/active_processes.md) | 0 | ||||
-rw-r--r-- | src/health/guides/qos/10min_qos_packet_drops.md (renamed from health/guides/qos/10min_qos_packet_drops.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ram/oom_kill.md (renamed from health/guides/ram/oom_kill.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ram/ram_available.md (renamed from health/guides/ram/ram_available.md) | 0 | ||||
-rw-r--r-- | src/health/guides/ram/ram_in_use.md (renamed from health/guides/ram/ram_in_use.md) | 0 | ||||
-rw-r--r-- | src/health/guides/redis/redis_bgsave_broken.md (renamed from health/guides/redis/redis_bgsave_broken.md) | 0 | ||||
-rw-r--r-- | src/health/guides/redis/redis_bgsave_slow.md (renamed from health/guides/redis/redis_bgsave_slow.md) | 0 | ||||
-rw-r--r-- | src/health/guides/redis/redis_connections_rejected.md (renamed from health/guides/redis/redis_connections_rejected.md) | 0 | ||||
-rw-r--r-- | src/health/guides/redis/redis_master_link_down.md (renamed from health/guides/redis/redis_master_link_down.md) | 0 | ||||
-rw-r--r-- | src/health/guides/retroshare/retroshare_dht_working.md (renamed from health/guides/retroshare/retroshare_dht_working.md) | 0 | ||||
-rw-r--r-- | src/health/guides/riakkv/riakkv_1h_kv_get_mean_latency.md (renamed from health/guides/riakkv/riakkv_1h_kv_get_mean_latency.md) | 0 | ||||
-rw-r--r-- | src/health/guides/riakkv/riakkv_1h_kv_put_mean_latency.md (renamed from health/guides/riakkv/riakkv_1h_kv_put_mean_latency.md) | 0 | ||||
-rw-r--r-- | src/health/guides/riakkv/riakkv_kv_get_slow.md (renamed from health/guides/riakkv/riakkv_kv_get_slow.md) | 5 | ||||
-rw-r--r-- | src/health/guides/riakkv/riakkv_kv_put_slow.md (renamed from health/guides/riakkv/riakkv_kv_put_slow.md) | 0 | ||||
-rw-r--r-- | src/health/guides/riakkv/riakkv_list_keys_active.md (renamed from health/guides/riakkv/riakkv_list_keys_active.md) | 0 | ||||
-rw-r--r-- | src/health/guides/riakkv/riakkv_vm_high_process_count.md (renamed from health/guides/riakkv/riakkv_vm_high_process_count.md) | 0 | ||||
-rw-r--r-- | src/health/guides/scaleio/scaleio_sdc_mdm_connection_state.md (renamed from health/guides/scaleio/scaleio_sdc_mdm_connection_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/scaleio/scaleio_storage_pool_capacity_utilization.md (renamed from health/guides/scaleio/scaleio_storage_pool_capacity_utilization.md) | 0 | ||||
-rw-r--r-- | src/health/guides/sync/sync_freq.md (renamed from health/guides/sync/sync_freq.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_automount_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_automount_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_device_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_device_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_mount_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_mount_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_path_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_path_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_scope_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_scope_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_service_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_service_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_slice_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_slice_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_socket_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_socket_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_swap_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_swap_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/systemdunits/systemd_target_unit_failed_state.md (renamed from health/guides/systemdunits/systemd_target_unit_failed_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/10s_ipv4_tcp_resets_received.md (renamed from health/guides/tcp/10s_ipv4_tcp_resets_received.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/10s_ipv4_tcp_resets_sent.md (renamed from health/guides/tcp/10s_ipv4_tcp_resets_sent.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/1m_ipv4_tcp_resets_received.md (renamed from health/guides/tcp/1m_ipv4_tcp_resets_received.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/1m_ipv4_tcp_resets_sent.md (renamed from health/guides/tcp/1m_ipv4_tcp_resets_sent.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/1m_tcp_accept_queue_drops.md (renamed from health/guides/tcp/1m_tcp_accept_queue_drops.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/1m_tcp_accept_queue_overflows.md (renamed from health/guides/tcp/1m_tcp_accept_queue_overflows.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/1m_tcp_syn_queue_cookies.md (renamed from health/guides/tcp/1m_tcp_syn_queue_cookies.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/1m_tcp_syn_queue_drops.md (renamed from health/guides/tcp/1m_tcp_syn_queue_drops.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/tcp_connections.md (renamed from health/guides/tcp/tcp_connections.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/tcp_memory.md (renamed from health/guides/tcp/tcp_memory.md) | 0 | ||||
-rw-r--r-- | src/health/guides/tcp/tcp_orphans.md (renamed from health/guides/tcp/tcp_orphans.md) | 0 | ||||
-rw-r--r-- | src/health/guides/timex/system_clock_sync_state.md (renamed from health/guides/timex/system_clock_sync_state.md) | 0 | ||||
-rw-r--r-- | src/health/guides/udp/1m_ipv4_udp_receive_buffer_errors.md (renamed from health/guides/udp/1m_ipv4_udp_receive_buffer_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/udp/1m_ipv4_udp_send_buffer_errors.md (renamed from health/guides/udp/1m_ipv4_udp_send_buffer_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/unbound/unbound_request_list_dropped.md (renamed from health/guides/unbound/unbound_request_list_dropped.md) | 0 | ||||
-rw-r--r-- | src/health/guides/unbound/unbound_request_list_overwritten.md (renamed from health/guides/unbound/unbound_request_list_overwritten.md) | 0 | ||||
-rw-r--r-- | src/health/guides/upsd/upsd_10min_ups_load.md (renamed from health/guides/upsd/upsd_10min_ups_load.md) | 0 | ||||
-rw-r--r-- | src/health/guides/upsd/upsd_ups_battery_charge.md (renamed from health/guides/upsd/upsd_ups_battery_charge.md) | 0 | ||||
-rw-r--r-- | src/health/guides/upsd/upsd_ups_last_collected_secs.md (renamed from health/guides/upsd/upsd_ups_last_collected_secs.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_applmgmt_health.md (renamed from health/guides/vcsa/vcsa_applmgmt_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_database_storage_health.md (renamed from health/guides/vcsa/vcsa_database_storage_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_load_health.md (renamed from health/guides/vcsa/vcsa_load_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_mem_health.md (renamed from health/guides/vcsa/vcsa_mem_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_software_updates_health.md (renamed from health/guides/vcsa/vcsa_software_updates_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_storage_health.md (renamed from health/guides/vcsa/vcsa_storage_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_swap_health.md (renamed from health/guides/vcsa/vcsa_swap_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vcsa/vcsa_system_health.md (renamed from health/guides/vcsa/vcsa_system_health.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_average_scheduler_utilization.md (renamed from health/guides/vernemq/vernemq_average_scheduler_utilization.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_cluster_dropped.md (renamed from health/guides/vernemq/vernemq_cluster_dropped.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_connack_sent_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_connack_sent_reason_unsuccessful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_disconnect_received_reason_not_normal.md (renamed from health/guides/vernemq/vernemq_mqtt_disconnect_received_reason_not_normal.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_disconnect_sent_reason_not_normal.md (renamed from health/guides/vernemq/vernemq_mqtt_disconnect_sent_reason_not_normal.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_puback_received_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_puback_received_reason_unsuccessful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_puback_sent_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_puback_sent_reason_unsuccessful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_puback_unexpected.md (renamed from health/guides/vernemq/vernemq_mqtt_puback_unexpected.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubcomp_received_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_pubcomp_received_reason_unsuccessful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubcomp_sent_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_pubcomp_sent_reason_unsuccessful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md (renamed from health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md) | 2 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_publish_auth_errors.md (renamed from health/guides/vernemq/vernemq_mqtt_publish_auth_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_publish_errors.md (renamed from health/guides/vernemq/vernemq_mqtt_publish_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubrec_invalid_error.md (renamed from health/guides/vernemq/vernemq_mqtt_pubrec_invalid_error.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubrec_received_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_pubrec_received_reason_unsuccessful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md) | 2 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubrel_received_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_pubrel_received_reason_unsuccessful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md (renamed from health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md) | 3 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_subscribe_auth_error.md (renamed from health/guides/vernemq/vernemq_mqtt_subscribe_auth_error.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_subscribe_error.md (renamed from health/guides/vernemq/vernemq_mqtt_subscribe_error.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_mqtt_unsubscribe_error.md (renamed from health/guides/vernemq/vernemq_mqtt_unsubscribe_error.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_netsplits.md (renamed from health/guides/vernemq/vernemq_netsplits.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_queue_message_drop.md (renamed from health/guides/vernemq/vernemq_queue_message_drop.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_queue_message_expired.md (renamed from health/guides/vernemq/vernemq_queue_message_expired.md) | 1 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_queue_message_unhandled.md (renamed from health/guides/vernemq/vernemq_queue_message_unhandled.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vernemq/vernemq_socket_errors.md (renamed from health/guides/vernemq/vernemq_socket_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_cpu_usage.md (renamed from health/guides/vsphere/vsphere_cpu_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_host_mem_usage.md (renamed from health/guides/vsphere/vsphere_host_mem_usage.md) | 5 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_inbound_packets_dropped.md (renamed from health/guides/vsphere/vsphere_inbound_packets_dropped.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_inbound_packets_dropped_ratio.md (renamed from health/guides/vsphere/vsphere_inbound_packets_dropped_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_inbound_packets_errors.md (renamed from health/guides/vsphere/vsphere_inbound_packets_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_inbound_packets_errors_ratio.md (renamed from health/guides/vsphere/vsphere_inbound_packets_errors_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_outbound_packets_dropped.md (renamed from health/guides/vsphere/vsphere_outbound_packets_dropped.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_outbound_packets_dropped_ratio.md (renamed from health/guides/vsphere/vsphere_outbound_packets_dropped_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_outbound_packets_errors.md (renamed from health/guides/vsphere/vsphere_outbound_packets_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_outbound_packets_errors_ratio.md (renamed from health/guides/vsphere/vsphere_outbound_packets_errors_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/vsphere/vsphere_vm_mem_usage.md (renamed from health/guides/vsphere/vsphere_vm_mem_usage.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/1m_bad_requests.md (renamed from health/guides/web_log/1m_bad_requests.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/1m_internal_errors.md (renamed from health/guides/web_log/1m_internal_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/1m_successful.md (renamed from health/guides/web_log/1m_successful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_10m_response_time.md (renamed from health/guides/web_log/web_log_10m_response_time.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_1m_bad_requests.md (renamed from health/guides/web_log/web_log_1m_bad_requests.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_1m_internal_errors.md (renamed from health/guides/web_log/web_log_1m_internal_errors.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_1m_redirects.md (renamed from health/guides/web_log/web_log_1m_redirects.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_1m_requests.md (renamed from health/guides/web_log/web_log_1m_requests.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_1m_successful.md (renamed from health/guides/web_log/web_log_1m_successful.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_1m_total_requests.md (renamed from health/guides/web_log/web_log_1m_total_requests.md) | 5 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_1m_unmatched.md | 15 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_5m_requests_ratio.md (renamed from health/guides/web_log/web_log_5m_requests_ratio.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_5m_successful.md (renamed from health/guides/web_log/web_log_5m_successful.md) | 4 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_5m_successful_old.md (renamed from health/guides/web_log/web_log_5m_successful_old.md) | 0 | ||||
-rw-r--r-- | src/health/guides/web_log/web_log_web_slow.md (renamed from health/guides/web_log/web_log_web_slow.md) | 9 | ||||
-rw-r--r-- | src/health/guides/whoisquery/whoisquery_days_until_expiration.md (renamed from health/guides/whoisquery/whoisquery_days_until_expiration.md) | 0 | ||||
-rw-r--r-- | src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md (renamed from health/guides/wifi/wifi_inbound_packets_dropped_ratio.md) | 1 | ||||
-rw-r--r-- | src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md (renamed from health/guides/wifi/wifi_outbound_packets_dropped_ratio.md) | 4 | ||||
-rw-r--r-- | src/health/guides/windows/windows_10min_cpu_usage.md (renamed from health/guides/windows/windows_10min_cpu_usage.md) | 7 | ||||
-rw-r--r-- | src/health/guides/windows/windows_disk_in_use.md (renamed from health/guides/windows/windows_disk_in_use.md) | 3 | ||||
-rw-r--r-- | src/health/guides/windows/windows_inbound_packets_discarded.md (renamed from health/guides/windows/windows_inbound_packets_discarded.md) | 3 | ||||
-rw-r--r-- | src/health/guides/windows/windows_inbound_packets_errors.md (renamed from health/guides/windows/windows_inbound_packets_errors.md) | 3 | ||||
-rw-r--r-- | src/health/guides/windows/windows_outbound_packets_discarded.md (renamed from health/guides/windows/windows_outbound_packets_discarded.md) | 3 | ||||
-rw-r--r-- | src/health/guides/windows/windows_outbound_packets_errors.md (renamed from health/guides/windows/windows_outbound_packets_errors.md) | 7 | ||||
-rw-r--r-- | src/health/guides/windows/windows_ram_in_use.md (renamed from health/guides/windows/windows_ram_in_use.md) | 4 | ||||
-rw-r--r-- | src/health/guides/windows/windows_swap_in_use.md (renamed from health/guides/windows/windows_swap_in_use.md) | 5 | ||||
-rw-r--r-- | src/health/guides/x509check/x509check_days_until_expiration.md (renamed from health/guides/x509check/x509check_days_until_expiration.md) | 3 | ||||
-rw-r--r-- | src/health/guides/x509check/x509check_revocation_status.md (renamed from health/guides/x509check/x509check_revocation_status.md) | 8 | ||||
-rw-r--r-- | src/health/guides/zfs/zfs_memory_throttle.md (renamed from health/guides/zfs/zfs_memory_throttle.md) | 0 | ||||
-rw-r--r-- | src/health/guides/zfs/zfs_pool_state_crit.md (renamed from health/guides/zfs/zfs_pool_state_crit.md) | 0 | ||||
-rw-r--r-- | src/health/guides/zfs/zfs_pool_state_warn.md (renamed from health/guides/zfs/zfs_pool_state_warn.md) | 0 | ||||
-rw-r--r-- | src/health/health.c | 172 | ||||
-rw-r--r-- | src/health/health.d/adaptec_raid.conf | 29 | ||||
-rw-r--r-- | src/health/health.d/anomalies.conf | 25 | ||||
-rw-r--r-- | src/health/health.d/apcupsd.conf (renamed from health/health.d/apcupsd.conf) | 4 | ||||
-rw-r--r-- | src/health/health.d/bcache.conf (renamed from health/health.d/bcache.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/beanstalkd.conf (renamed from health/health.d/beanstalkd.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/boinc.conf (renamed from health/health.d/boinc.conf) | 10 | ||||
-rw-r--r-- | src/health/health.d/btrfs.conf (renamed from health/health.d/btrfs.conf) | 19 | ||||
-rw-r--r-- | src/health/health.d/ceph.conf (renamed from health/health.d/ceph.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/cgroups.conf | 67 | ||||
-rw-r--r-- | src/health/health.d/clickhouse.conf | 140 | ||||
-rw-r--r-- | src/health/health.d/cockroachdb.conf (renamed from health/health.d/cockroachdb.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/consul.conf (renamed from health/health.d/consul.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/cpu.conf | 65 | ||||
-rw-r--r-- | src/health/health.d/dbengine.conf (renamed from health/health.d/dbengine.conf) | 9 | ||||
-rw-r--r-- | src/health/health.d/disks.conf | 161 | ||||
-rw-r--r-- | src/health/health.d/dns_query.conf (renamed from health/health.d/dns_query.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/dnsmasq_dhcp.conf (renamed from health/health.d/dnsmasq_dhcp.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/docker.conf (renamed from health/health.d/docker.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/elasticsearch.conf (renamed from health/health.d/elasticsearch.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/entropy.conf | 19 | ||||
-rw-r--r-- | src/health/health.d/exporting.conf (renamed from health/health.d/exporting.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/file_descriptors.conf | 30 | ||||
-rw-r--r-- | src/health/health.d/gearman.conf (renamed from health/health.d/gearman.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/geth.conf (renamed from health/health.d/geth.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/go.d.plugin.conf | 17 | ||||
-rw-r--r-- | src/health/health.d/haproxy.conf (renamed from health/health.d/haproxy.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/hdfs.conf (renamed from health/health.d/hdfs.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/httpcheck.conf | 88 | ||||
-rw-r--r-- | src/health/health.d/ioping.conf (renamed from health/health.d/ioping.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ipc.conf | 32 | ||||
-rw-r--r-- | src/health/health.d/ipfs.conf (renamed from health/health.d/ipfs.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ipmi.conf (renamed from health/health.d/ipmi.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/isc_dhcpd.conf | 15 | ||||
-rw-r--r-- | src/health/health.d/kubelet.conf (renamed from health/health.d/kubelet.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/linux_power_supply.conf (renamed from health/health.d/linux_power_supply.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/load.conf | 67 | ||||
-rw-r--r-- | src/health/health.d/lvm.conf | 31 | ||||
-rw-r--r-- | src/health/health.d/mdstat.conf (renamed from health/health.d/mdstat.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/megacli.conf | 77 | ||||
-rw-r--r-- | src/health/health.d/memcached.conf (renamed from health/health.d/memcached.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/memory.conf | 76 | ||||
-rw-r--r-- | src/health/health.d/ml.conf (renamed from health/health.d/ml.conf) | 7 | ||||
-rw-r--r-- | src/health/health.d/mysql.conf (renamed from health/health.d/mysql.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/net.conf | 239 | ||||
-rw-r--r-- | src/health/health.d/netfilter.conf | 18 | ||||
-rw-r--r-- | src/health/health.d/nvme.conf (renamed from health/health.d/nvme.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/pihole.conf (renamed from health/health.d/pihole.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ping.conf | 50 | ||||
-rw-r--r-- | src/health/health.d/plugin.conf (renamed from health/health.d/plugin.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/portcheck.conf (renamed from health/health.d/portcheck.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/postgres.conf | 216 | ||||
-rw-r--r-- | src/health/health.d/processes.conf (renamed from health/health.d/processes.conf) | 1 | ||||
-rw-r--r-- | src/health/health.d/python.d.plugin.conf | 17 | ||||
-rw-r--r-- | src/health/health.d/qos.conf | 16 | ||||
-rw-r--r-- | src/health/health.d/ram.conf | 76 | ||||
-rw-r--r-- | src/health/health.d/redis.conf | 58 | ||||
-rw-r--r-- | src/health/health.d/retroshare.conf (renamed from health/health.d/retroshare.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/riakkv.conf (renamed from health/health.d/riakkv.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/scaleio.conf (renamed from health/health.d/scaleio.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/softnet.conf | 53 | ||||
-rw-r--r-- | src/health/health.d/storcli.conf | 61 | ||||
-rw-r--r-- | src/health/health.d/swap.conf | 34 | ||||
-rw-r--r-- | src/health/health.d/synchronization.conf (renamed from health/health.d/synchronization.conf) | 1 | ||||
-rw-r--r-- | src/health/health.d/systemdunits.conf | 177 | ||||
-rw-r--r-- | src/health/health.d/tcp_conn.conf | 21 | ||||
-rw-r--r-- | src/health/health.d/tcp_listen.conf | 93 | ||||
-rw-r--r-- | src/health/health.d/tcp_mem.conf | 22 | ||||
-rw-r--r-- | src/health/health.d/tcp_orphans.conf | 22 | ||||
-rw-r--r-- | src/health/health.d/tcp_resets.conf | 66 | ||||
-rw-r--r-- | src/health/health.d/timex.conf | 17 | ||||
-rw-r--r-- | src/health/health.d/udp_errors.conf | 37 | ||||
-rw-r--r-- | src/health/health.d/unbound.conf (renamed from health/health.d/unbound.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/upsd.conf | 46 | ||||
-rw-r--r-- | src/health/health.d/vcsa.conf (renamed from health/health.d/vcsa.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/vernemq.conf (renamed from health/health.d/vernemq.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/vsphere.conf | 66 | ||||
-rw-r--r-- | src/health/health.d/web_log.conf (renamed from health/health.d/web_log.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/whoisquery.conf | 14 | ||||
-rw-r--r-- | src/health/health.d/windows.conf | 108 | ||||
-rw-r--r-- | src/health/health.d/x509check.conf | 26 | ||||
-rw-r--r-- | src/health/health.d/zfs.conf | 90 | ||||
-rw-r--r-- | src/health/health.h | 102 | ||||
-rw-r--r-- | src/health/health_config.c | 842 | ||||
-rw-r--r-- | src/health/health_dyncfg.c | 842 | ||||
-rw-r--r-- | src/health/health_event_loop.c | 771 | ||||
-rw-r--r-- | src/health/health_internals.h | 129 | ||||
-rw-r--r-- | src/health/health_json.c | 286 | ||||
-rw-r--r-- | src/health/health_log.c (renamed from health/health_log.c) | 36 | ||||
-rw-r--r-- | src/health/health_notifications.c | 569 | ||||
-rw-r--r-- | src/health/health_prototypes.c | 717 | ||||
-rw-r--r-- | src/health/health_prototypes.h | 138 | ||||
-rw-r--r-- | src/health/health_silencers.c | 495 | ||||
-rw-r--r-- | src/health/health_silencers.h | 55 | ||||
-rw-r--r-- | src/health/health_variable.c | 466 | ||||
-rw-r--r-- | src/health/notifications/README.md | 207 | ||||
-rwxr-xr-x | src/health/notifications/alarm-email.sh (renamed from health/notifications/alarm-email.sh) | 0 | ||||
-rwxr-xr-x | src/health/notifications/alarm-notify.sh.in (renamed from health/notifications/alarm-notify.sh.in) | 37 | ||||
-rwxr-xr-x | src/health/notifications/alarm-test.sh (renamed from health/notifications/alarm-test.sh) | 0 | ||||
-rw-r--r-- | src/health/notifications/alerta/README.md | 128 | ||||
-rw-r--r-- | src/health/notifications/alerta/metadata.yaml (renamed from health/notifications/alerta/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/awssns/README.md | 180 | ||||
-rw-r--r-- | src/health/notifications/awssns/metadata.yaml (renamed from health/notifications/awssns/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/custom/README.md | 211 | ||||
-rw-r--r-- | src/health/notifications/custom/metadata.yaml (renamed from health/notifications/custom/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/discord/README.md | 117 | ||||
-rw-r--r-- | src/health/notifications/discord/metadata.yaml (renamed from health/notifications/discord/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/dynatrace/README.md | 124 | ||||
-rw-r--r-- | src/health/notifications/dynatrace/metadata.yaml (renamed from health/notifications/dynatrace/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/email/README.md | 114 | ||||
-rw-r--r-- | src/health/notifications/email/metadata.yaml (renamed from health/notifications/email/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/flock/README.md | 113 | ||||
-rw-r--r-- | src/health/notifications/flock/metadata.yaml (renamed from health/notifications/flock/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/gotify/README.md | 98 | ||||
-rw-r--r-- | src/health/notifications/gotify/metadata.yaml (renamed from health/notifications/gotify/metadata.yaml) | 0 | ||||
-rwxr-xr-x | src/health/notifications/health_alarm_notify.conf (renamed from health/notifications/health_alarm_notify.conf) | 0 | ||||
-rw-r--r-- | src/health/notifications/health_email_recipients.conf (renamed from health/notifications/health_email_recipients.conf) | 0 | ||||
-rw-r--r-- | src/health/notifications/irc/README.md | 132 | ||||
-rw-r--r-- | src/health/notifications/irc/metadata.yaml (renamed from health/notifications/irc/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/kavenegar/README.md | 120 | ||||
-rw-r--r-- | src/health/notifications/kavenegar/metadata.yaml (renamed from health/notifications/kavenegar/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/matrix/README.md | 132 | ||||
-rw-r--r-- | src/health/notifications/matrix/metadata.yaml | 91 | ||||
-rw-r--r-- | src/health/notifications/messagebird/README.md | 117 | ||||
-rw-r--r-- | src/health/notifications/messagebird/metadata.yaml (renamed from health/notifications/messagebird/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/msteams/README.md | 118 | ||||
-rw-r--r-- | src/health/notifications/msteams/metadata.yaml (renamed from health/notifications/msteams/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/ntfy/README.md | 135 | ||||
-rw-r--r-- | src/health/notifications/ntfy/metadata.yaml (renamed from health/notifications/ntfy/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/opsgenie/README.md | 98 | ||||
-rw-r--r-- | src/health/notifications/opsgenie/metadata.yaml (renamed from health/notifications/opsgenie/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/pagerduty/README.md | 117 | ||||
-rw-r--r-- | src/health/notifications/pagerduty/metadata.yaml (renamed from health/notifications/pagerduty/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/prowl/README.md | 119 | ||||
-rw-r--r-- | src/health/notifications/prowl/metadata.yaml (renamed from health/notifications/prowl/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/pushbullet/README.md | 117 | ||||
-rw-r--r-- | src/health/notifications/pushbullet/metadata.yaml (renamed from health/notifications/pushbullet/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/pushover/README.md | 119 | ||||
-rw-r--r-- | src/health/notifications/pushover/metadata.yaml (renamed from health/notifications/pushover/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/rocketchat/README.md | 116 | ||||
-rw-r--r-- | src/health/notifications/rocketchat/metadata.yaml (renamed from health/notifications/rocketchat/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/sample-metadata.yaml (renamed from health/notifications/sample-metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/slack/README.md | 101 | ||||
-rw-r--r-- | src/health/notifications/slack/metadata.yaml (renamed from health/notifications/slack/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/smstools3/README.md | 126 | ||||
-rw-r--r-- | src/health/notifications/smstools3/metadata.yaml (renamed from health/notifications/smstools3/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/syslog/README.md | 132 | ||||
-rw-r--r-- | src/health/notifications/syslog/metadata.yaml (renamed from health/notifications/syslog/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/telegram/README.md | 117 | ||||
-rw-r--r-- | src/health/notifications/telegram/metadata.yaml | 76 | ||||
-rw-r--r-- | src/health/notifications/twilio/README.md | 118 | ||||
-rw-r--r-- | src/health/notifications/twilio/metadata.yaml (renamed from health/notifications/twilio/metadata.yaml) | 0 | ||||
-rw-r--r-- | src/health/notifications/web/README.md | 18 | ||||
-rw-r--r-- | src/health/rrdcalc.c | 512 | ||||
-rw-r--r-- | src/health/rrdcalc.h | 148 | ||||
-rw-r--r-- | src/health/rrdvar.c | 342 | ||||
-rw-r--r-- | src/health/rrdvar.h | 44 | ||||
-rw-r--r-- | src/health/schema.d/health%3Aalert%3Aprototype.json | 687 |
455 files changed, 14669 insertions, 170 deletions
diff --git a/src/health/README.md b/src/health/README.md new file mode 100644 index 000000000..5c479af5f --- /dev/null +++ b/src/health/README.md @@ -0,0 +1,17 @@ +# Alerts and notifications + +Netdata offers two ways to receive alert notifications on external integrations. These methods work independently, which means you can enable both at the same time to send alert notifications to any number of endpoints. + +Both methods use a node's health alerts to generate the content of a notification. + +Read our documentation on [configuring alerts](/src/health/REFERENCE.md) to change the preconfigured thresholds or to create tailored alerts for your infrastructure. + +- Netdata Cloud provides centralized alert notifications, utilizing the health status data already sent to Netdata Cloud from connected nodes to send alerts to configured integrations. [Supported integrations](/docs/alerts-&-notifications/notifications/centralized-cloud-notifications) include Amazon SNS, Discord, Slack, Splunk, and others. + +- The Netdata Agent offers a [wider range of notification options](/docs/alerts-&-notifications/notifications/agent-dispatched-notifications) directly from the agent itself. You can choose from over a dozen services, including email, Slack, PagerDuty, Twilio, and others, for more granular control over notifications on each node. + +The Netdata Agent is a watchdog for the health and performance of your systems, services, and applications. We've worked closely with our community of DevOps engineers, SREs, and developers to define hundreds of production-ready alerts that work without any configuration. + +The Agent's health monitoring system is also dynamic and fully customizable. You can write entirely new alerts, tune the pre-configured alerts for every app/service [the Agent collects metrics from](/src/collectors/COLLECTORS.md), or silence anything you're not interested in. 
You can even power complex lookups by running statistical algorithms against your metrics. + +You can [use various alert notification methods](/docs/alerts-and-notifications/notifications/README.md), [customize alerts](/src/health/REFERENCE.md), and [disable/silence](/src/health/REFERENCE.md#disable-or-silence-alerts) alerts. diff --git a/src/health/REFERENCE.md b/src/health/REFERENCE.md new file mode 100644 index 000000000..8b0a9177e --- /dev/null +++ b/src/health/REFERENCE.md @@ -0,0 +1,1106 @@ +# Configure alerts + +Netdata's health watchdog is highly configurable, with support for dynamic thresholds, hysteresis, alert templates, and +more. You can tweak any of the existing alerts based on your infrastructure's topology or specific monitoring needs, or +create new entities. + +You can use health alerts in conjunction with any of Netdata's [collectors](/src/collectors/README.md) (see +the [supported collector list](/src/collectors/COLLECTORS.md)) to monitor the health of your systems, containers, and +applications in real time. + +While you can see active alerts both on the local dashboard and Netdata Cloud, all health alerts are configured _per +node_ via individual Netdata Agents. If you want to deploy a new alert across your +[infrastructure](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md), you must configure each node with the same health configuration +files. + +## Reload health configuration + +You do not need to restart the Netdata Agent between changes to health configuration files, such as specific health entities. Instead, you can use `netdatacli` and the `reload-health` option to prevent gaps in metrics collection. + +```bash +sudo netdatacli reload-health +``` + +If `netdatacli` doesn't work on your system, send a `SIGUSR2` signal to the daemon, which reloads health configuration without restarting the entire process. 
+ +```bash +killall -USR2 netdata +``` + +## Edit health configuration files + +You can configure the Agent's health watchdog service by editing files in two locations: + +- The `[health]` section in `netdata.conf`. By editing the daemon's behavior, you can disable health monitoring + altogether, run health checks more or less often, and more. See + [daemon configuration](/src/daemon/config/README.md#health-section-options) for a table of + all the available settings, their default values, and what they control. + +- The individual `.conf` files in `health.d/`. These health entity files are organized by the type of metric they are + performing calculations on or their associated collector. You should edit these files using the `edit-config` + script. For example: `sudo ./edit-config health.d/cpu.conf`. + +Navigate to your [Netdata config directory](/docs/netdata-agent/configuration/README.md) and +use `edit-config` to make changes to any of these files. + +### Edit individual alerts + +For example, to edit the `cpu.conf` health configuration file, run: + +```bash +sudo ./edit-config health.d/cpu.conf +``` + +Each health configuration file contains one or more health _entities_, which always begin with `alarm:` or `template:`. +For example, here is the first health entity in `health.d/cpu.conf`: + +```yaml + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System +component: CPU + lookup: average -10m unaligned of user,system,softirq,irq,guest + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: CPU utilization + info: Average cpu utilization for the last 10 minutes (excluding iowait, nice and steal) + to: sysadmin +``` + +To tune this alert to trigger warning and critical alerts at a lower CPU utilization, change the `warn` and `crit` lines +to the values of your choosing. 
For example: + +```yaml + warn: $this > (($status >= $WARNING) ? (60) : (75)) + crit: $this > (($status == $CRITICAL) ? (75) : (85)) +``` + +Save the file and [reload Netdata's health configuration](#reload-health-configuration) to apply your changes. + +## Disable or silence alerts + +Alerts and notifications can be disabled permanently via configuration changes, or temporarily, via the +[health management API](/src/web/api/health/README.md). The +available options are described below. + +### Disable all alerts + +In the `netdata.conf` `[health]` section, set `enabled` to `no`, and restart the agent. + +### Disable some alerts + +In the `netdata.conf` `[health]` section, set `enabled alarms` to a +[simple pattern](/src/libnetdata/simple_pattern/README.md) that +excludes one or more alerts. For example, `enabled alarms = !oom_kill *` will load all alerts except `oom_kill`. + +You can also [edit the file where the alert is defined](#edit-individual-alerts), comment out its definition, +and [reload Netdata's health configuration](#reload-health-configuration). + +### Silence an individual alert + +You can stop receiving notifications for an individual alert by [changing](#edit-individual-alerts) the `to:` line to `silent`. + +```yaml + to: silent +``` + +This action requires that you [reload Netdata's health configuration](#reload-health-configuration). + +### Temporarily disable alerts at runtime + +When you need to frequently disable all or some alerts from triggering during certain times (for instance +when running backups) you can use the +[health management API](/src/web/api/health/README.md). +The API allows you to issue commands to control the health engine's behavior without changing configuration, +or restarting the agent. 
+ +### Temporarily silence notifications at runtime + +If you want health checks to keep running and alerts to keep getting triggered, but notifications to be +suppressed temporarily, you can use the +[health management API](/src/web/api/health/README.md). +The API allows you to issue commands to control the health engine's behavior without changing configuration, +or restarting the agent. + +## Write a new health entity + +While tuning existing alerts may work in some cases, you may need to write entirely new health entities based on how +your systems, containers, and applications work. + +Read the [health entity reference](#health-entity-reference) for a full listing of the format, +syntax, and functionality of health entities. + +To write a new health entity into a new file, navigate to your [Netdata config directory](/docs/netdata-agent/configuration/README.md), +then use `touch` to create a new file in the `health.d/` directory. Use `edit-config` to start editing the file. + +As an example, let's create a `ram-usage.conf` file. + +```bash +sudo touch health.d/ram-usage.conf +sudo ./edit-config health.d/ram-usage.conf +``` + +For example, here is a health entity that triggers a warning alert when a node's RAM usage rises above 80%, and a +critical alert above 90%: + +```yaml + alarm: ram_usage + on: system.ram +lookup: average -1m percentage of used + units: % + every: 1m + warn: $this > 80 + crit: $this > 90 + info: The percentage of RAM being used by the system. +``` + +Let's look into each of the lines to see how they create a working health entity. + +- `alarm`: The name for your new entity. The name needs to follow these requirements: + - Any alphabet letter or number. + - The symbols `.` and `_`. + - Cannot be `chart name`, `dimension name`, `family name`, or `chart variable names`. + +- `on`: Which chart the entity listens to. 
+ +- `lookup`: Which metrics the alert monitors, the duration of time to monitor, and how to process the metrics into a + usable format. + - `average`: Calculate the average of all the metrics collected. + - `-1m`: Use metrics from 1 minute ago until now to calculate that average. + - `percentage`: Clarify that we're calculating a percentage of RAM usage. + - `of used`: Specify which dimension (`used`) on the `system.ram` chart you want to monitor with this entity. + +- `units`: Use percentages rather than absolute units. + +- `every`: How often to perform the `lookup` calculation to decide whether to trigger this alert. + +- `warn`/`crit`: The value at which Netdata should trigger a warning or critical alert. This example uses simple + syntax, but most pre-configured health entities use + [hysteresis](#special-use-of-the-conditional-operator) to avoid superfluous notifications. + +- `info`: A description of the alert, which will appear in the dashboard and notifications. + +In human-readable format: + +> This health entity, named **ram_usage**, watches the **system.ram** chart. It looks up the last **1 minute** of +> metrics from the **used** dimension and calculates the **average** of all those metrics in a **percentage** format, +> using a **% unit**. The entity performs this lookup **every minute**. +> +> If the average RAM usage percentage over the last 1 minute is **more than 80%**, the entity triggers a warning alert. +> If the usage is **more than 90%**, the entity triggers a critical alert. + +When you finish writing this new health entity, [reload Netdata's health configuration](#reload-health-configuration) to +see it live on the local dashboard or Netdata Cloud. + +## Health entity reference + +The following reference contains information about the syntax and options of _health entities_, which Netdata attaches +to charts in order to trigger alerts. + +### Entity types + +There are two entity types: **alarms** and **templates**. 
They have the same format and feature set—the only difference +is their label. + +**Alerts** are attached to specific charts and use the `alarm` label. + +**Templates** define rules that apply to all charts of a specific context, and use the `template` label. Templates help +you apply one entity to all disks, all network interfaces, all MySQL databases, and so on. + +Alerts have higher precedence and will override templates. +If an `alarm` and a `template` entity have the same name and are attached to the same chart, Netdata will use the `alarm`. + +### Entity format + +Netdata parses the following lines. Beneath the table is an in-depth explanation of each line's purpose and syntax. + +- The `alarm` or `template` line must be the first line of any entity. +- The `on` line is **always required**. +- The `every` line is **required** if not using `lookup`. +- Each entity **must** have at least one of the following lines: `lookup`, `calc`, `warn`, or `crit`. +- A few lines use space-separated lists to define how the entity behaves. You can use `*` as a wildcard or prefix with + `!` for a negative match. Order is important, too! See our [simple patterns docs](/src/libnetdata/simple_pattern/README.md) for + more examples. +- Lines terminated by a `\` are spliced together with the next line. The backslash is removed and the following line is + joined with the current one. No space is inserted, so you may split a line anywhere, even in the middle of a word. + This comes in handy if your `info` line consists of several sentences. + +| line | required | functionality | +|-----------------------------------------------------|-----------------|---------------------------------------------------------------------------------------| +| [`alarm`/`template`](#alert-line-alarm-or-template) | yes | Name of the alert/template. | +| [`on`](#alert-line-on) | yes | The chart this alert should attach to. | +| [`class`](#alert-line-class) | no | The general alert classification. 
| +| [`type`](#alert-line-type) | no | What area of the system the alert monitors. | +| [`component`](#alert-line-component) | no | Specific component of the type of the alert. | +| [`lookup`](#alert-line-lookup) | yes | The database lookup to find and process metrics for the chart specified through `on`. | +| [`calc`](#alert-line-calc) | yes (see above) | A calculation to apply to the value found via `lookup` or another variable. | +| [`every`](#alert-line-every) | no | The frequency of the alert. | +| [`green`/`red`](#alert-lines-green-and-red) | no | Set the green and red thresholds of a chart. | +| [`warn`/`crit`](#alert-lines-warn-and-crit) | yes (see above) | Expressions evaluating to true or false, and when true, will trigger the alert. | +| [`to`](#alert-line-to) | no | A list of roles to send notifications to. | +| [`exec`](#alert-line-exec) | no | The script to execute when the alert changes status. | +| [`delay`](#alert-line-delay) | no | Optional hysteresis settings to prevent floods of notifications. | +| [`repeat`](#alert-line-repeat) | no | The interval for sending notifications when an alert is in WARNING or CRITICAL mode. | +| [`options`](#alert-line-options) | no | Add an option to not clear alerts. | +| [`host labels`](#alert-line-host-labels) | no | Restrict an alert or template to a list of matching labels present on a host. | +| [`chart labels`](#alert-line-chart-labels) | no | Restrict an alert or template to a list of matching labels present on a chart. | +| [`summary`](#alert-line-summary) | no | A brief description of the alert. | +| [`info`](#alert-line-info) | no | A longer text field that provides more information of this alert | + +The `alarm` or `template` line must be the first line of any entity. + +#### Alert line `alarm` or `template` + +This line starts an alert or template based on the [entity type](#entity-types) you're interested in creating. 
+ +**Alert:** + +```yaml +alarm: NAME +``` + +**Template:** + +```yaml +template: NAME +``` + +`NAME` can be any alpha character, with `.` (period) and `_` (underscore) as the only allowed symbols, but the names +cannot be `chart name`, `dimension name`, `family name`, or `chart variables names`. + +#### Alert line `on` + +This line defines the chart this alert should attach to. + +**Alerts:** + +```yaml +on: CHART +``` + +The value `CHART` should be the unique ID or name of the chart you're interested in, as shown on the dashboard. In the +image below, the unique ID is `system.cpu`. + +![Finding the unique ID of a +chart](https://user-images.githubusercontent.com/1153921/67443082-43b16e80-f5b8-11e9-8d33-d6ee052c6678.png) + +**Template:** + +```yaml +on: CONTEXT +``` + +The value `CONTEXT` should be the context you want this template to attach to. + +Need to find the context? Hover over the date on any given chart and look at the tooltip. In the image below, which +shows a disk I/O chart, the tooltip reads: `proc:/proc/diskstats, disk.io`. + +![Finding the context of a chart via the tooltip](https://user-images.githubusercontent.com/1153921/68882856-2b230880-06cd-11ea-923b-b28c4632d479.png) + +You're interested in what comes after the comma: `disk.io`. That's the name of the chart's context. + +If you create a template using the `disk.io` context, it will apply an alert to every disk available on your system. + +#### Alert line `class` + +This indicates the type of error (or general problem area) that the alert or template applies to. For example, `Latency` can be used for alerts that trigger on latency issues on network interfaces, web servers, or database systems. 
Example: + +```yaml +class: Latency +``` + +<details> +<summary>Netdata's stock alerts use the following `class` attributes by default:</summary> + +| Class | +|-------------| +| Errors | +| Latency | +| Utilization | +| Workload | + +</details> + +`class` will default to `Unknown` if the line is missing from the alert configuration. + +#### Alert line `type` + +Type can be used to indicate the broader area of the system that the alert applies to. For example, under the general `Database` type, you can group together alerts that operate on various database systems, like `MySQL`, `CockroachDB`, `CouchDB` etc. Example: + +```yaml +type: Database +``` + +<details> +<summary>Netdata's stock alerts use the following `type` attributes by default, but feel free to adjust for your own requirements.</summary> + +| Type | Description | +|-----------------|------------------------------------------------------------------------------------------------| +| Ad Filtering | Services related to Ad Filtering (like pi-hole) | +| Certificates | Certificates monitoring related | +| Cgroups | Alerts for cpu and memory usage of control groups | +| Computing | Alerts for shared computing applications (e.g. boinc) | +| Containers | Container related alerts (e.g. docker instances) | +| Database | Database systems (e.g. MySQL, PostgreSQL, etc) | +| Data Sharing | Used to group together alerts for data sharing applications | +| DHCP | Alerts for dhcp related services | +| DNS | Alerts for dns related services | +| Kubernetes | Alerts for kubernetes nodes monitoring | +| KV Storage | Key-Value pairs services alerts (e.g. memcached) | +| Linux | Services specific to Linux (e.g. systemd) | +| Messaging | Alerts for message passing services (e.g. vernemq) | +| Netdata | Internal Netdata components monitoring | +| Other | When an alert doesn't fit in other types. | +| Power Supply | Alerts from power supply related services (e.g. apcupsd) | +| Search engine | Alerts for search services (e.g. 
elasticsearch) | +| Storage | Class for alerts dealing with storage services (storage devices typically live under `System`) | +| System | General system alerts (e.g. cpu, network, etc.) | +| Virtual Machine | Virtual Machine software | +| Web Proxy | Web proxy software (e.g. squid) | +| Web Server | Web server software (e.g. Apache, nginx, etc.) | +| Windows | Alerts for monitoring Windows services | + +</details> + +If an alert configuration is missing the `type` line, its value will default to `Unknown`. + +#### Alert line `component` + +Component can be used to narrow down what the previous `type` value specifies for each alert or template. Continuing from the previous example, `component` might include `MySQL`, `CockroachDB`, `MongoDB`, all under the same `Database` type. Example: + +```yaml +component: MySQL +``` + +As with the `class` and `type` lines, if `component` is missing from the configuration, its value will default to `Unknown`. + +#### Alert line `lookup` + +This line makes a database lookup to find a value. The result of this lookup is available as `$this`. + +The format is: + +```yaml +lookup: METHOD(GROUPING OPTIONS) AFTER [at BEFORE] [every DURATION] [OPTIONS] [of DIMENSIONS] +``` + +The full [database query API](/src/web/api/queries/README.md) is supported. In short: + +- `METHOD` is one of the available [grouping methods](/src/web/api/queries/README.md#grouping-methods) such as `average`, `min`, `max` etc. + This is required. + + - `GROUPING OPTIONS` are optional and can have the form `CONDITION VALUE`, where `CONDITION` is `!=`, `=`, `<=`, `<`, `>`, `>=` and `VALUE` is a number. The `CONDITION` and `VALUE` are required for `countif`, while `VALUE` is used by `percentile`, `trimmed_mean` and `trimmed_median`. + +- `AFTER` is a relative number of seconds, but it also accepts a single letter for changing + the units, like `-1s` = 1 second in the past, `-1m` = 1 minute in the past, `-1h` = 1 hour + in the past, `-1d` = 1 day in the past. 
You need a negative number (i.e. how far in the past + to look for the value). **This is required**. + +- `at BEFORE` is by default 0 and is not required. Using this you can define the end of the + lookup. So data will be evaluated between `AFTER` and `BEFORE`. + +- `every DURATION` sets the updated frequency of the lookup (supports single letter units as + above too). + +- `OPTIONS` is a space separated list of `percentage`, `absolute`, `min`, `max`, `average`, `sum`, + `min2max`, `unaligned`, `match-ids`, `match-names`. + + - `percentage` during time-aggregation, calculate the percentage of the selected dimensions over the total of all dimensions. + - `absolute` during time-aggregation, turns all sample values positive before using them. + - `min` after time-aggregation of each dimension, return the minimum of all dimensions. + - `max` after time-aggregation of each dimension, return the maximum of all dimensions. + - `average` after time-aggregation of each dimension, return the average of all dimensions. + - `sum` after time-aggregation of each dimension, return the sum of all dimensions (this is the default). + - `min2max` after time-aggregation of each dimension, return the delta between the min and the max of the dimensions. + - `unaligned` prevents shifting the query window to multiples of the query duration. + - `match-ids` matches the dimensions based on their IDs (the default is enabled, give `match-names` to disable). + - `match-names` matches the dimension based on their names (the default is enabled, give `match-ids` to disable). + +- `of DIMENSIONS` is optional and has to be the last parameter. Dimensions have to be separated + by `,` or `|`. The space characters found in dimensions will be kept as-is (a few dimensions + have spaces in their names). This accepts Netdata simple patterns _(with `words` separated by + `,` or `|` instead of spaces)_ and the `match-ids` and `match-names` options affect the searches + for dimensions. 
+ +The result of the lookup will be available as `$this` and `$NAME` in expressions. +The timestamps of the timeframe evaluated by the database lookup are available as variables +`$after` and `$before` (both are unix timestamps). + +#### Alert line `calc` + +A `calc` is designed to apply some calculation to the values or variables available to the entity. The result of the +calculation will be made available in the `$this` variable, overwriting the value from your `lookup`, to use in warning +and critical expressions. + +When paired with `lookup`, `calc` will perform the calculation just after `lookup` has retrieved a value from Netdata's +database. + +You can use `calc` without `lookup` if you are using [other available variables](#variables). + +The `calc` line uses [expressions](#expressions) for its syntax. + +```yaml +calc: EXPRESSION +``` + +#### Alert line `every` + +Sets the update frequency of this alert. This is the same as the `every DURATION` given +in the `lookup` lines. + +Format: + +```yaml +every: DURATION +``` + +`DURATION` accepts `s` for seconds, `m` for minutes, `h` for hours, `d` for days. + +#### Alert lines `green` and `red` + +Set the green and red thresholds of a chart. Both are available as `$green` and `$red` in expressions. If multiple +alerts define different thresholds, the ones defined by the first alert will be used. Eventually they will be visualized +on the dashboard, so only one set of them is allowed. If you need multiple sets of them in different alerts, use +absolute numbers instead of `$red` and `$green`. + +Format: + +```yaml +green: NUMBER +red: NUMBER +``` + +#### Alert lines `warn` and `crit` + +Define the expression that triggers either a warning or critical alert. These are optional, and should evaluate to +either true or false (or zero/non-zero). + +The format uses Netdata's [expressions syntax](#expressions). 
+ +```yaml +warn: EXPRESSION +crit: EXPRESSION +``` + +#### Alert line `to` + +This will be the first script parameter that will be executed when the alert changes its status. Its meaning is left up to +the `exec` script. + +The default `exec` script, `alarm-notify.sh`, uses this field as a space separated list of roles, which are then +consulted to find the exact recipients per notification method. + +Format: + +```yaml +to: ROLE1 ROLE2 ROLE3 ... +``` + +#### Alert line `exec` + +Script to be executed when the alert status changes. + +Format: + +```yaml +exec: SCRIPT +``` + +The default `SCRIPT` is Netdata's `alarm-notify.sh`, which supports all the notifications methods Netdata supports, +including custom hooks. + +#### Alert line `delay` + +This is used to provide optional hysteresis settings for the notifications, to defend against notification floods. These +settings do not affect the actual alert - only the time the `exec` script is executed. + +Format: + +```yaml +delay: [[[up U] [down D] multiplier M] max X] +``` + +- `up U` defines the delay to be applied to a notification for an alert that raised its status + (i.e. CLEAR to WARNING, CLEAR to CRITICAL, WARNING to CRITICAL). For example, `up 10s`, the + notification for this event will be sent 10 seconds after the actual event. This is used in + hope the alert will get back to its previous state within the duration given. The default `U` + is zero. + +- `down D` defines the delay to be applied to a notification for an alert that moves to lower + state (i.e. CRITICAL to WARNING, CRITICAL to CLEAR, WARNING to CLEAR). For example, `down 1m` + will delay the notification by 1 minute. This is used to prevent notifications for flapping + alerts. The default `D` is zero. + +- `multiplier M` multiplies `U` and `D` when an alert changes state, while a notification is + delayed. The default multiplier is `1.0`. + +- `max X` defines the maximum absolute notification delay an alert may get. 
The default `X` + is `max(U * M, D * M)` (i.e. the max duration of `U` or `D` multiplied once with `M`). + + Example: + + `delay: up 10s down 15m multiplier 2 max 1h` + + The time is `00:00:00` and the status of the alert is CLEAR. + + | time of event | new status | delay | notification will be sent | why | + |---------------|------------|---------------------|---------------------------|-------------------------------------------------------------------------------| + | 00:00:01 | WARNING | `up 10s` | 00:00:11 | first state switch | + | 00:00:05 | CLEAR | `down 15m x2` | 00:30:05 | the alert changes state while a notification is delayed, so it was multiplied | + | 00:00:06 | WARNING | `up 10s x2 x2` | 00:00:26 | multiplied twice | + | 00:00:07 | CLEAR | `down 15m x2 x2 x2` | 00:45:07 | multiplied 3 times. | + + So: + + - `U` and `D` are multiplied by `M` every time the alert changes state (any state, not just + their matching one) and a delay is in place. + - All are reset to their defaults when the alert switches state without a delay in place. + +#### Alert line `repeat` + +Defines the interval between repeating notifications for the alerts in CRITICAL or WARNING mode. This will override the +default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating +notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION`, which can be found in +the stock health configuration. When one of these intervals is greater than 0, Netdata will activate the repeat notification +for `CRITICAL`, `CLEAR` and `WARNING` messages. + +Format: + +```yaml +repeat: [off] [warning DURATION] [critical DURATION] +``` + +- `off`: Turns off the repeating feature for the current alert. This is effective when the default repeat settings have + been enabled in health configuration. +- `warning DURATION`: Defines the interval when the alert is in WARNING state. 
Use `0s` to turn off the repeating + notification for WARNING mode. +- `critical DURATION`: Defines the interval when the alert is in CRITICAL state. Use `0s` to turn off the repeating + notification for CRITICAL mode. + +#### Alert line `options` + +The only possible value for the `options` line is: + +```yaml +options: no-clear-notification +``` + +For some alerts we need to compare two time-frames to detect anomalies. For example, `health.d/httpcheck.conf` has an +alert template called `web_service_slow` that compares the average HTTP call response time over the last 3 minutes +with the average over the last hour. It triggers a warning alert when the average of the last 3 minutes is twice +the average of the last hour. In such cases, it is easy to trigger the alert, but difficult to tell when the alert is +cleared. As time passes, the newest window moves into the older, so the average response time of the last hour will keep +increasing. Eventually, the comparison will find the averages in the two time-frames close enough to clear the alert. +However, the issue was not resolved; it's just a matter of the newer data "polluting" the old. For such alerts, it's a +good idea to tell Netdata to not clear the notification, by using the `no-clear-notification` option. + +#### Alert line `host labels` + +Defines the list of labels present on a host. See our [host labels guide](/docs/netdata-agent/configuration/organize-systems-metrics-and-alerts.md) for +an explanation of host labels and how to implement them. + +For example, let's suppose that `netdata.conf` is configured with the following labels: + +```yaml +[host labels] + installed = 20191211 + room = server +``` + +And more labels in `netdata.conf` for workstations: + +```yaml +[host labels] + installed = 201705 + room = workstation +``` + +By defining labels inside of `netdata.conf`, you can now apply labels to alerts. 
For example, you can add the following +line to any alerts you'd like to apply to hosts that have the label `room = server`. + +```yaml +host labels: room = server +``` + +The `host labels` is a space-separated list that accepts simple patterns. For example, you can create an alert +that will be applied to all hosts installed in the last decade with the following line: + +```yaml +host labels: installed = 201* +``` + +See our [simple patterns docs](/src/libnetdata/simple_pattern/README.md) for more examples. + +#### Alert line `chart labels` + +Similar to host labels, the `chart labels` key can be used to filter if an alert will load or not for a specific chart, based on +whether these chart labels match or not. + +The list of chart labels present on each chart can be obtained from http://localhost:19999/api/v1/charts?all + +For example, each `disk_space` chart defines a chart label called `mount_point` with each instance of this chart having +a value there of which mount point it monitors. + +If you have an e.g. external disk mounted on `/mnt/disk1` and you don't wish any related disk space alerts running for +it (but you do for all other mount points), you can add the following to the alert's configuration: + +```yaml +chart labels: mount_point=!/mnt/disk1 * +``` + +The `chart labels` is a space-separated list that accepts simple patterns. If you use multiple different chart labels, +then the result is an AND between them. i.e. the following: + +```yaml +chart labels: mount_point=/mnt/disk1 device=sda +``` + +Will create the alert if the `mount_point` is `/mnt/disk1` and the `device` is `sda`. Furthermore, if a chart label name +is specified that does not exist in the chart, the chart won't be matched. + +See our [simple patterns docs](/src/libnetdata/simple_pattern/README.md) for more examples. + +#### Alert line `summary` + +The summary field contains a brief title of the alert. It is used as the subject for the notifications, and in +dashboard list of alerts. 
An example for the `ram_available` alert is: + +```yaml +summary: Available Ram +``` + +summary fields can contain special variables in their text that will be replaced during run-time to provide more specific +alert information. Current variables supported are: + +| variable | description | +|---------------------|-------------------------------------------------------------------| +| ${family} | Will be replaced by the family instance for the alert (e.g. eth0) | +| ${label:LABEL_NAME} | The variable will be replaced with the value of the chart label | + +For example, a summary field like the following: + +```yaml +summary: 1 minute received traffic overflow for ${label:device} +``` + +Will be rendered on the alert acting on interface `eth0` as: + +```yaml +summary: 1 minute received traffic overflow for eth0 +``` + +> Please note that variable names are case-sensitive. + +#### Alert line `info` + +The info field can contain a small piece of text describing the alert or template. This will be rendered in +notifications and UI elements whenever the specific alert is in focus. An example for the `ram_available` alert is: + +```yaml +info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping +``` + +info fields can contain special variables in their text that will be replaced during run-time to provide more specific +alert information. Current variables supported are: + +| variable | description | +|---------------------|-------------------------------------------------------------------| +| ${family} | Will be replaced by the family instance for the alert (e.g. 
eth0) | +| ${label:LABEL_NAME} | The variable will be replaced with the value of the chart label | + +For example, an info field like the following: + +```yaml +info: average inbound utilization for the network interface ${family} over the last minute +``` + +Will be rendered on the alert acting on interface `eth0` as: + +```yaml +info: average inbound utilization for the network interface eth0 over the last minute +``` + +An alert acting on a chart that has a chart label named e.g. `target`, with a value of `https://netdata.cloud/`, +can be enriched as follows: + +```yaml +info: average ratio of HTTP responses with unexpected status over the last 5 minutes for the site ${label:target} +``` + +Will become: + +```yaml +info: average ratio of HTTP responses with unexpected status over the last 5 minutes for the site https://netdata.cloud/ +``` + +> Please note that variable names are case-sensitive. + +## Expressions + +Netdata has an internal infix expression parser under `libnetdata/eval`. This parses expressions and creates an internal +structure that allows fast execution of them. + +These operators are supported `+`, `-`, `*`, `/`, `<`, `==`, `<=`, `<>`, `!=`, `>`, `>=`, `&&`, `||`, `!`, `AND`, `OR`, `NOT`. +Boolean operators result in either `1` (true) or `0` (false). + +The conditional evaluation operator `?` is supported too. Using this operator IF-THEN-ELSE conditional statements can be +specified. The format is: `(condition) ? (true expression) : (false expression)`. So, Netdata will first evaluate the +`condition` and based on the result will either evaluate `true expression` or `false expression`. + +Example: `($this > 0) ? ($avail * 2) : ($used / 2)`. + +Nested such expressions are also supported (i.e. `true expression` and `false expression` can contain conditional +evaluations). + +Expressions also support the `abs()` function. + +Expressions can have variables. Variables start with `$`. Check below for more information. 
+ +There are two special values you can use: + +- `nan`, for example `$this != nan` will check if the variable `this` is available. A variable can be `nan` if the + database lookup failed. All calculations (e.g. addition, multiplication, etc.) with a `nan` result in a `nan`. + +- `inf`, for example `$this != inf` will check if `this` is not infinite. A value or variable can be set to infinite + if divided by zero. All calculations (e.g. addition, multiplication, etc.) with an `inf` result in an `inf`. + +### Special use of the conditional operator + +A common (but not necessarily obvious) use of the conditional evaluation operator is to provide +[hysteresis](https://en.wikipedia.org/wiki/Hysteresis) around the critical or warning thresholds. This usage helps to +avoid bogus messages resulting from small variations in the value when it is varying regularly but staying close to the +threshold value, without needing to delay sending messages at all. + +An example of such usage from the default CPU usage alerts bundled with Netdata is: + +```yaml +warn: $this > (($status >= $WARNING) ? (75) : (85)) +crit: $this > (($status == $CRITICAL) ? (85) : (95)) +``` + +The above lines say: + +- If the alert is currently a warning (or critical), then the threshold for being considered a warning is 75, otherwise it's 85. + +- If the alert is currently critical, then the threshold for being considered critical is 85, otherwise it's 95. + +Which, in turn, results in the following behavior: + +- While the value is rising, it will trigger a warning when it exceeds 85, and a critical alert when it exceeds 95. + +- While the value is falling, it will return to a warning state when it goes below 85, and a normal state when it goes + below 75. + +- If the value is constantly varying between 80 and 90, then it will trigger a warning the first time it goes above + 85, but will remain a warning until it goes below 75 (or goes above 95, at which point it becomes critical). 
+ +- If the value is constantly varying between 90 and 100, then it will trigger a critical alert the first time it goes + above 95, but will remain a critical alert until it goes below 85 (at which point it will return to being a warning). + +## Variables + +You can find all the variables that can be used for a given chart, using +`http://NODE:19999/api/v1/alarm_variables?chart=CHART_NAME`, replacing `NODE` with the IP address or hostname for your +Agent dashboard. For example, [variables for the `system.cpu` chart of the +registry](https://registry.my-netdata.io/api/v1/alarm_variables?chart=system.cpu). + +> If you don't know how to find the CHART_NAME, you can read about it [here](/src/web/README.md#charts). + +Netdata supports 3 internal indexes for variables that will be used in health monitoring. + +<details><summary>The variables below can be used in both chart alerts and context templates.</summary> + +Although the `alarm_variables` link shows you variables for a particular chart, the same variables can also be used in +templates for charts belonging to a given [context](/src/web/README.md#contexts). The reason is that all charts of a given +context are essentially identical, with the only difference being the family that identifies a particular hardware or software instance. + +</details> + +- **chart local variables**. All the dimensions of the chart are exposed as local variables. The value of `$this` for + the other configured alerts of the chart also appears, under the name of each configured alert. 
+ + Charts also define a few special variables: + + - `$last_collected_t` is the unix timestamp of the last data collection + - `$collected_total_raw` is the sum of all the dimensions (their last collected values) + - `$update_every` is the update frequency of the chart + - `$green` and `$red` are the thresholds defined in alerts (these are per chart - the chart inherits them from the first alert that defined them) + + > Chart dimensions define their last calculated (i.e. interpolated) value, exactly as + shown on the charts, but also a variable with their name and suffix `_raw` that resolves + to the last collected value - as collected and another with suffix `_last_collected_t` + that resolves to the unix timestamp the dimension was last collected (there may be dimensions + that fail to be collected while others continue normally). + +- **host variables**. All the dimensions of all charts, including all alerts, in fullname. + Fullname is `CHART.VARIABLE`, where `CHART` is either the chart id or the chart name (both + are supported). + +- **special variables** are: + + - `$this`, which is resolved to the value of the current alert. + + - `$status`, which is resolved to the current status of the alert (the current = the last + status, i.e. before the current database lookup and the evaluation of the `calc` line). + This value can be compared with `$REMOVED`, `$UNINITIALIZED`, `$UNDEFINED`, `$CLEAR`, + `$WARNING`, `$CRITICAL`. These values are incremental, e.g. `$status > $CLEAR` works as + expected. + + - `$now`, which is resolved to the current unix timestamp. + +## Alert statuses + +Alerts can have the following statuses: + +- `REMOVED` - the alert has been deleted (this happens when a SIGUSR2 is sent to Netdata + to reload health configuration) + +- `UNINITIALIZED` - the alert is not initialized yet + +- `UNDEFINED` - the alert failed to be calculated (i.e. the database lookup failed, + a division by zero occurred, etc.) + +- `CLEAR` - the alert is not armed / raised (i.e. 
is OK) + +- `WARNING` - the warning expression resulted in true or non-zero + +- `CRITICAL` - the critical expression resulted in true or non-zero + +The external script will be called for all status changes. + +## Example alerts + +Check the `health/health.d/` directory for all alerts shipped with Netdata. + +Here are a few examples: + +### Example 1 - check server alive + +A simple check if an apache server is alive: + +```yaml +template: apache_last_collected_secs + on: apache.requests + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) +``` + +The above checks that Netdata is able to collect data from apache. In detail: + +```yaml +template: apache_last_collected_secs +``` + +The above defines a **template** named `apache_last_collected_secs`. +The name is important since `$apache_last_collected_secs` resolves to the `calc` line. +So, try to give something descriptive. + +```yaml + on: apache.requests +``` + +The above applies the **template** to all charts that have `context = apache.requests` +(i.e. all your apache servers). + +```yaml + calc: $now - $last_collected_t +``` + +- `$now` is a standard variable that resolves to the current timestamp. + +- `$last_collected_t` is the last data collection timestamp of the chart. + So this calculation gives the number of seconds passed since the last data collection. + +```yaml + every: 10s +``` + +The alert will be evaluated every 10 seconds. + +```yaml + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) +``` + +If these result in non-zero or true, they trigger the alert. + +- `$this` refers to the value of this alert (e.g. the result of the `calc` line). + We could also use `$apache_last_collected_secs`. + +`$update_every` is the update frequency of the chart, in seconds. + +So, the warning condition checks if we have not collected data from apache for 5 +iterations and the critical condition checks for 10 iterations. 
+ +### Example 2 - disk space + +Check if any of the disks is critically low on disk space: + +```yaml +template: disk_full_percent + on: disk.space + calc: $used * 100 / ($avail + $used) + every: 1m + warn: $this > 80 + crit: $this > 95 + repeat: warning 120s critical 10s +``` + +`$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard. + +So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage. + +This is a repeating alert and if the alert becomes CRITICAL it repeats the notifications every 10 seconds. It also +repeats notifications every 2 minutes if the alert goes into WARNING mode. + +### Example 3 - disk fill rate + +Predict if any disk will run out of space in the near future. + +We do this in 2 steps: + +Calculate the disk fill rate: + +```yaml + template: disk_fill_rate + on: disk.space + lookup: max -1s at -30m unaligned of avail + calc: ($this - $avail) / (30 * 60) + every: 15s +``` + +In the `calc` line: `$this` is the result of the `lookup` line (i.e. the free space 30 minutes +ago) and `$avail` is the current disk free space. So the `calc` line will either have a positive +number of GB/second if the disk is filling up, or a negative number of GB/second if the disk is +freeing up space. + +There are no `warn` or `crit` lines here. So, this template will just do the calculation and +nothing more. + +Predict the hours after which the disk will run out of space: + +```yaml + template: disk_full_after_hours + on: disk.space + calc: $avail / $disk_fill_rate / 3600 + every: 10s + warn: $this > 0 and $this < 48 + crit: $this > 0 and $this < 24 +``` + +The `calc` line estimates the time, in hours, until we run out of disk space. Of course, only +positive values are interesting for this check, so the warning and critical conditions check +for positive values and trigger when the remaining free space will last less than 48 and 24 hours respectively. 
+ +Once this alert triggers we will receive an email like this: + +![image](https://cloud.githubusercontent.com/assets/2662304/17839993/87872b32-6802-11e6-8e08-b2e4afef93bb.png) + +### Example 4 - dropped packets + +Check if any network interface is dropping packets: + +```yaml +template: 30min_packet_drops + on: net.drops + lookup: sum -30m unaligned absolute + every: 10s + crit: $this > 0 +``` + +The `lookup` line will calculate the sum of all dropped packets in the last 30 minutes. + +The `crit` line will issue a critical alert if even a single packet has been dropped. + +Note that the drops chart does not exist if a network interface has never dropped a single packet. +When Netdata detects a dropped packet, it will add the chart, and it will automatically attach this +alert to it. + +### Example 5 - Z-Score based alert + +Derive a "[Z Score](https://en.wikipedia.org/wiki/Standard_score)" based alert on the `user` dimension of the `system.cpu` chart: + +```yaml + alarm: cpu_user_mean + on: system.cpu +lookup: mean -60s of user + every: 10s + + alarm: cpu_user_stddev + on: system.cpu +lookup: stddev -60s of user + every: 10s + + alarm: cpu_user_zscore + on: system.cpu +lookup: mean -10s of user + calc: ($this - $cpu_user_mean) / $cpu_user_stddev + every: 10s + warn: $this < -2 or $this > 2 + crit: $this < -3 or $this > 3 +``` + +Since [`z = (x - mean) / stddev`](https://en.wikipedia.org/wiki/Standard_score) we create two input alerts, one for `mean` and one for `stddev` and then use them both as inputs in our final `cpu_user_zscore` alert. + +### Example 6 - [Anomaly rate](/src/ml/README.md#anomaly-rate) based CPU chart alert + +Warning if 5 minute rolling [anomaly rate](/src/ml/README.md#anomaly-rate) averaged across all CPU dimensions is above 5%, critical if it goes above 20%: + +```yaml +template: ml_5min_cpu_chart + on: system.cpu + lookup: average -5m anomaly-bit of * + calc: $this + units: % + every: 30s + warn: $this > (($status >= $WARNING) ? 
(5) : (20)) + crit: $this > (($status == $CRITICAL) ? (20) : (100)) + info: rolling 5min anomaly rate for system.cpu chart +``` + +The `lookup` line will calculate the average anomaly rate across all `system.cpu` dimensions over the last 5 minutes. In this case +Netdata will create one alert for the chart. + +### Example 7 - [Anomaly rate](/src/ml/README.md#anomaly-rate) based node level alert + +Warning if 5 minute rolling [anomaly rate](/src/ml/README.md#anomaly-rate) averaged across all ML enabled dimensions is above 5%, critical if it goes above 20%: + +```yaml +template: ml_5min_node + on: anomaly_detection.anomaly_rate + lookup: average -5m of anomaly_rate + calc: $this + units: % + every: 30s + warn: $this > (($status >= $WARNING) ? (5) : (20)) + crit: $this > (($status == $CRITICAL) ? (20) : (100)) + info: rolling 5min anomaly rate for all ML enabled dims +``` + +The `lookup` line will use the `anomaly_rate` dimension of the `anomaly_detection.anomaly_rate` ML chart to calculate the average [node level anomaly rate](/src/ml/README.md#node-anomaly-rate) over the last 5 minutes. + +## Troubleshooting + +You can compile Netdata with [debugging](/src/daemon/README.md#debugging) and then set in `netdata.conf`: + +```yaml +[global] + debug flags = 0x0000000000800000 +``` + +Then check your `/var/log/netdata/debug.log`. It will show you how it works. Important: this will generate a lot of +output in debug.log. + +You can find the context of charts by looking up the chart in either `http://NODE:19999/netdata.conf` or +`http://NODE:19999/api/v1/charts`, replacing `NODE` with the IP address or hostname for your Agent dashboard. + +You can find how Netdata interpreted the expressions by examining the alert at +`http://NODE:19999/api/v1/alarms?all`. For each expression, Netdata will return the expression as given in its +config file, and the same expression with additional parentheses added to indicate the evaluation flow of the +expression. 
diff --git a/health/guides/adaptec_raid/adaptec_raid_ld_status.md b/src/health/guides/adaptec_raid/adaptec_raid_ld_status.md index 7da1cdd17..7da1cdd17 100644 --- a/health/guides/adaptec_raid/adaptec_raid_ld_status.md +++ b/src/health/guides/adaptec_raid/adaptec_raid_ld_status.md diff --git a/health/guides/adaptec_raid/adaptec_raid_pd_state.md b/src/health/guides/adaptec_raid/adaptec_raid_pd_state.md index 00c9d5901..00c9d5901 100644 --- a/health/guides/adaptec_raid/adaptec_raid_pd_state.md +++ b/src/health/guides/adaptec_raid/adaptec_raid_pd_state.md diff --git a/health/guides/anomalies/anomalies_anomaly_flags.md b/src/health/guides/anomalies/anomalies_anomaly_flags.md index d4ffa1641..d4ffa1641 100644 --- a/health/guides/anomalies/anomalies_anomaly_flags.md +++ b/src/health/guides/anomalies/anomalies_anomaly_flags.md diff --git a/health/guides/anomalies/anomalies_anomaly_probabilities.md b/src/health/guides/anomalies/anomalies_anomaly_probabilities.md index cea04a43e..cea04a43e 100644 --- a/health/guides/anomalies/anomalies_anomaly_probabilities.md +++ b/src/health/guides/anomalies/anomalies_anomaly_probabilities.md diff --git a/health/guides/apcupsd/apcupsd_10min_ups_load.md b/src/health/guides/apcupsd/apcupsd_10min_ups_load.md index 4069de9f0..4069de9f0 100644 --- a/health/guides/apcupsd/apcupsd_10min_ups_load.md +++ b/src/health/guides/apcupsd/apcupsd_10min_ups_load.md diff --git a/health/guides/apcupsd/apcupsd_last_collected_secs.md b/src/health/guides/apcupsd/apcupsd_last_collected_secs.md index 7c8f8035d..fb8d9f9fc 100644 --- a/health/guides/apcupsd/apcupsd_last_collected_secs.md +++ b/src/health/guides/apcupsd/apcupsd_last_collected_secs.md @@ -42,5 +42,5 @@ This alert is related to your American Power Conversion (APC) uninterruptible po ### Useful resources -1. [Netdata - APC UPS monitoring](https://learn.netdata.cloud/docs/data-collection/ups/apc-ups) +1. [Netdata - APC UPS monitoring](/src/collectors/charts.d.plugin/apcupsd/integrations/apc_ups.md) 2. 
[`apcupsd` - Power management and control software for APC UPS](https://github.com/apcupsd/apcupsd) diff --git a/health/guides/apcupsd/apcupsd_ups_charge.md b/src/health/guides/apcupsd/apcupsd_ups_charge.md index 600520b58..600520b58 100644 --- a/health/guides/apcupsd/apcupsd_ups_charge.md +++ b/src/health/guides/apcupsd/apcupsd_ups_charge.md diff --git a/health/guides/beanstalk/beanstalk_number_of_tubes.md b/src/health/guides/beanstalk/beanstalk_number_of_tubes.md index 8f14f07f1..8f14f07f1 100644 --- a/health/guides/beanstalk/beanstalk_number_of_tubes.md +++ b/src/health/guides/beanstalk/beanstalk_number_of_tubes.md diff --git a/health/guides/beanstalk/beanstalk_server_buried_jobs.md b/src/health/guides/beanstalk/beanstalk_server_buried_jobs.md index 99d4f5074..99d4f5074 100644 --- a/health/guides/beanstalk/beanstalk_server_buried_jobs.md +++ b/src/health/guides/beanstalk/beanstalk_server_buried_jobs.md diff --git a/health/guides/beanstalk/beanstalk_tube_buried_jobs.md b/src/health/guides/beanstalk/beanstalk_tube_buried_jobs.md index 76a43cc68..76a43cc68 100644 --- a/health/guides/beanstalk/beanstalk_tube_buried_jobs.md +++ b/src/health/guides/beanstalk/beanstalk_tube_buried_jobs.md diff --git a/health/guides/boinc/boinc_active_tasks.md b/src/health/guides/boinc/boinc_active_tasks.md index efdb7b9e2..efdb7b9e2 100644 --- a/health/guides/boinc/boinc_active_tasks.md +++ b/src/health/guides/boinc/boinc_active_tasks.md diff --git a/health/guides/boinc/boinc_compute_errors.md b/src/health/guides/boinc/boinc_compute_errors.md index 8390686c3..8390686c3 100644 --- a/health/guides/boinc/boinc_compute_errors.md +++ b/src/health/guides/boinc/boinc_compute_errors.md diff --git a/health/guides/boinc/boinc_total_tasks.md b/src/health/guides/boinc/boinc_total_tasks.md index c14e15f85..ed7225784 100644 --- a/health/guides/boinc/boinc_total_tasks.md +++ b/src/health/guides/boinc/boinc_total_tasks.md @@ -24,8 +24,6 @@ This alert monitors the average number of total tasks for the 
BOINC system over sudo /etc/init.d/boinc-client restart ``` -2. For other operating systems or custom installations, refer to the BOINC's documentation for restarting the client: https://boinc.berkeley.edu/wiki/Stop_or_restart_BOINC - #### Check system resources BOINC tasks may fail or slow down if there is not enough system resources (CPU, RAM, or Disk Space) available. Monitor your system performance using tools like `top`, `free`, and `df`, and make adjustments if necessary to ensure that BOINC has enough resources to complete tasks. diff --git a/health/guides/boinc/boinc_upload_errors.md b/src/health/guides/boinc/boinc_upload_errors.md index 80c0ad364..ffd8c78b9 100644 --- a/health/guides/boinc/boinc_upload_errors.md +++ b/src/health/guides/boinc/boinc_upload_errors.md @@ -18,7 +18,7 @@ This alert indicates that your BOINC node is experiencing an increase in the ave 4. Inspect BOINC client logs - Consult the BOINC client logs to gain insight into the upload errors. The logs can be found in the client's data directory. Refer to the [BOINC log file documentation](https://boinc.berkeley.edu/wiki/Log_Files) for more information on how to read and analyze the logs. + Consult the BOINC client logs to gain insight into the upload errors. The logs can be found in the client's data directory. 5. 
Contact project support diff --git a/health/guides/btrfs/btrfs_allocated.md b/src/health/guides/btrfs/btrfs_allocated.md index 690d45d06..690d45d06 100644 --- a/health/guides/btrfs/btrfs_allocated.md +++ b/src/health/guides/btrfs/btrfs_allocated.md diff --git a/health/guides/btrfs/btrfs_data.md b/src/health/guides/btrfs/btrfs_data.md index 7782b2d88..7782b2d88 100644 --- a/health/guides/btrfs/btrfs_data.md +++ b/src/health/guides/btrfs/btrfs_data.md diff --git a/health/guides/btrfs/btrfs_device_corruption_errors.md b/src/health/guides/btrfs/btrfs_device_corruption_errors.md index 98fd4b440..98fd4b440 100644 --- a/health/guides/btrfs/btrfs_device_corruption_errors.md +++ b/src/health/guides/btrfs/btrfs_device_corruption_errors.md diff --git a/health/guides/btrfs/btrfs_device_flush_errors.md b/src/health/guides/btrfs/btrfs_device_flush_errors.md index c9bb1b118..c9bb1b118 100644 --- a/health/guides/btrfs/btrfs_device_flush_errors.md +++ b/src/health/guides/btrfs/btrfs_device_flush_errors.md diff --git a/health/guides/btrfs/btrfs_device_generation_errors.md b/src/health/guides/btrfs/btrfs_device_generation_errors.md index b357b83e9..b357b83e9 100644 --- a/health/guides/btrfs/btrfs_device_generation_errors.md +++ b/src/health/guides/btrfs/btrfs_device_generation_errors.md diff --git a/health/guides/btrfs/btrfs_device_read_errors.md b/src/health/guides/btrfs/btrfs_device_read_errors.md index 684cd0be5..684cd0be5 100644 --- a/health/guides/btrfs/btrfs_device_read_errors.md +++ b/src/health/guides/btrfs/btrfs_device_read_errors.md diff --git a/health/guides/btrfs/btrfs_device_write_errors.md b/src/health/guides/btrfs/btrfs_device_write_errors.md index cdf221723..cdf221723 100644 --- a/health/guides/btrfs/btrfs_device_write_errors.md +++ b/src/health/guides/btrfs/btrfs_device_write_errors.md diff --git a/health/guides/btrfs/btrfs_metadata.md b/src/health/guides/btrfs/btrfs_metadata.md index 6c44ee09b..6c44ee09b 100644 --- a/health/guides/btrfs/btrfs_metadata.md +++ 
b/src/health/guides/btrfs/btrfs_metadata.md diff --git a/health/guides/btrfs/btrfs_system.md b/src/health/guides/btrfs/btrfs_system.md index 82d321edb..82d321edb 100644 --- a/health/guides/btrfs/btrfs_system.md +++ b/src/health/guides/btrfs/btrfs_system.md diff --git a/health/guides/ceph/ceph_cluster_space_usage.md b/src/health/guides/ceph/ceph_cluster_space_usage.md index 8dbe2e876..8dbe2e876 100644 --- a/health/guides/ceph/ceph_cluster_space_usage.md +++ b/src/health/guides/ceph/ceph_cluster_space_usage.md diff --git a/health/guides/cgroups/cgroup_10min_cpu_usage.md b/src/health/guides/cgroups/cgroup_10min_cpu_usage.md index 0ba413634..0ba413634 100644 --- a/health/guides/cgroups/cgroup_10min_cpu_usage.md +++ b/src/health/guides/cgroups/cgroup_10min_cpu_usage.md diff --git a/health/guides/cgroups/cgroup_ram_in_use.md b/src/health/guides/cgroups/cgroup_ram_in_use.md index 59440e0b8..59440e0b8 100644 --- a/health/guides/cgroups/cgroup_ram_in_use.md +++ b/src/health/guides/cgroups/cgroup_ram_in_use.md diff --git a/health/guides/cgroups/k8s_cgroup_10min_cpu_usage.md b/src/health/guides/cgroups/k8s_cgroup_10min_cpu_usage.md index 3168e2793..3168e2793 100644 --- a/health/guides/cgroups/k8s_cgroup_10min_cpu_usage.md +++ b/src/health/guides/cgroups/k8s_cgroup_10min_cpu_usage.md diff --git a/health/guides/cgroups/k8s_cgroup_ram_in_use.md b/src/health/guides/cgroups/k8s_cgroup_ram_in_use.md index aec443b78..aec443b78 100644 --- a/health/guides/cgroups/k8s_cgroup_ram_in_use.md +++ b/src/health/guides/cgroups/k8s_cgroup_ram_in_use.md diff --git a/health/guides/cockroachdb/cockroachdb_open_file_descriptors_limit.md b/src/health/guides/cockroachdb/cockroachdb_open_file_descriptors_limit.md index ad2fa4ac7..ad2fa4ac7 100644 --- a/health/guides/cockroachdb/cockroachdb_open_file_descriptors_limit.md +++ b/src/health/guides/cockroachdb/cockroachdb_open_file_descriptors_limit.md diff --git a/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md 
b/src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md index ef495cb72..4a48f1dba 100644 --- a/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md +++ b/src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md @@ -44,7 +44,7 @@ This alert indicates that there are unavailable ranges in your CockroachDB clust 6. Consider rebalancing the cluster - Rebalancing the cluster can help distribute the load more evenly across nodes and reduce the number of unavailable ranges. See the [CockroachDB documentation](https://www.cockroachlabs.com/docs/stable/training/manual-rebalancing.html) for more information on manual rebalancing. + Rebalancing the cluster can help distribute the load more evenly across nodes and reduce the number of unavailable ranges. See the [CockroachDB documentation](https://www.cockroachlabs.com/docs/stable/demo-replication-and-rebalancing.html) for more information on manual rebalancing. ### Useful resources diff --git a/health/guides/cockroachdb/cockroachdb_underreplicated_ranges.md b/src/health/guides/cockroachdb/cockroachdb_underreplicated_ranges.md index e82695993..e82695993 100644 --- a/health/guides/cockroachdb/cockroachdb_underreplicated_ranges.md +++ b/src/health/guides/cockroachdb/cockroachdb_underreplicated_ranges.md diff --git a/health/guides/cockroachdb/cockroachdb_used_storage_capacity.md b/src/health/guides/cockroachdb/cockroachdb_used_storage_capacity.md index ac1bc000c..ac1bc000c 100644 --- a/health/guides/cockroachdb/cockroachdb_used_storage_capacity.md +++ b/src/health/guides/cockroachdb/cockroachdb_used_storage_capacity.md diff --git a/health/guides/cockroachdb/cockroachdb_used_usable_storage_capacity.md b/src/health/guides/cockroachdb/cockroachdb_used_usable_storage_capacity.md index ec00dbb98..ec00dbb98 100644 --- a/health/guides/cockroachdb/cockroachdb_used_usable_storage_capacity.md +++ b/src/health/guides/cockroachdb/cockroachdb_used_usable_storage_capacity.md diff --git 
a/health/guides/consul/consul_autopilot_health_status.md b/src/health/guides/consul/consul_autopilot_health_status.md index 42ccab5a6..42ccab5a6 100644 --- a/health/guides/consul/consul_autopilot_health_status.md +++ b/src/health/guides/consul/consul_autopilot_health_status.md diff --git a/health/guides/consul/consul_autopilot_server_health_status.md b/src/health/guides/consul/consul_autopilot_server_health_status.md index 687c2bb1d..687c2bb1d 100644 --- a/health/guides/consul/consul_autopilot_server_health_status.md +++ b/src/health/guides/consul/consul_autopilot_server_health_status.md diff --git a/health/guides/consul/consul_client_rpc_requests_exceeded.md b/src/health/guides/consul/consul_client_rpc_requests_exceeded.md index eab01e820..eab01e820 100644 --- a/health/guides/consul/consul_client_rpc_requests_exceeded.md +++ b/src/health/guides/consul/consul_client_rpc_requests_exceeded.md diff --git a/health/guides/consul/consul_client_rpc_requests_failed.md b/src/health/guides/consul/consul_client_rpc_requests_failed.md index 7d8cb3311..7d8cb3311 100644 --- a/health/guides/consul/consul_client_rpc_requests_failed.md +++ b/src/health/guides/consul/consul_client_rpc_requests_failed.md diff --git a/health/guides/consul/consul_gc_pause_time.md b/src/health/guides/consul/consul_gc_pause_time.md index c4408234b..c4408234b 100644 --- a/health/guides/consul/consul_gc_pause_time.md +++ b/src/health/guides/consul/consul_gc_pause_time.md diff --git a/health/guides/consul/consul_license_expiration_time.md b/src/health/guides/consul/consul_license_expiration_time.md index 3f86b0845..3f86b0845 100644 --- a/health/guides/consul/consul_license_expiration_time.md +++ b/src/health/guides/consul/consul_license_expiration_time.md diff --git a/health/guides/consul/consul_node_health_check_status.md b/src/health/guides/consul/consul_node_health_check_status.md index 44b431edc..44b431edc 100644 --- a/health/guides/consul/consul_node_health_check_status.md +++ 
b/src/health/guides/consul/consul_node_health_check_status.md diff --git a/health/guides/consul/consul_raft_leader_last_contact_time.md b/src/health/guides/consul/consul_raft_leader_last_contact_time.md index baa6ed462..baa6ed462 100644 --- a/health/guides/consul/consul_raft_leader_last_contact_time.md +++ b/src/health/guides/consul/consul_raft_leader_last_contact_time.md diff --git a/health/guides/consul/consul_raft_leadership_transitions.md b/src/health/guides/consul/consul_raft_leadership_transitions.md index 59eb3e738..59eb3e738 100644 --- a/health/guides/consul/consul_raft_leadership_transitions.md +++ b/src/health/guides/consul/consul_raft_leadership_transitions.md diff --git a/health/guides/consul/consul_raft_thread_fsm_saturation.md b/src/health/guides/consul/consul_raft_thread_fsm_saturation.md index 12c5f7df3..12c5f7df3 100644 --- a/health/guides/consul/consul_raft_thread_fsm_saturation.md +++ b/src/health/guides/consul/consul_raft_thread_fsm_saturation.md diff --git a/health/guides/consul/consul_raft_thread_main_saturation.md b/src/health/guides/consul/consul_raft_thread_main_saturation.md index 7f33627d0..7f33627d0 100644 --- a/health/guides/consul/consul_raft_thread_main_saturation.md +++ b/src/health/guides/consul/consul_raft_thread_main_saturation.md diff --git a/health/guides/consul/consul_service_health_check_status.md b/src/health/guides/consul/consul_service_health_check_status.md index e9da2508f..e9da2508f 100644 --- a/health/guides/consul/consul_service_health_check_status.md +++ b/src/health/guides/consul/consul_service_health_check_status.md diff --git a/health/guides/cpu/10min_cpu_iowait.md b/src/health/guides/cpu/10min_cpu_iowait.md index b05530e84..b05530e84 100644 --- a/health/guides/cpu/10min_cpu_iowait.md +++ b/src/health/guides/cpu/10min_cpu_iowait.md diff --git a/health/guides/cpu/10min_cpu_usage.md b/src/health/guides/cpu/10min_cpu_usage.md index 17e153f6f..17e153f6f 100644 --- a/health/guides/cpu/10min_cpu_usage.md +++ 
b/src/health/guides/cpu/10min_cpu_usage.md diff --git a/health/guides/cpu/20min_steal_cpu.md b/src/health/guides/cpu/20min_steal_cpu.md index e87c6f057..e87c6f057 100644 --- a/health/guides/cpu/20min_steal_cpu.md +++ b/src/health/guides/cpu/20min_steal_cpu.md diff --git a/health/guides/dbengine/10min_dbengine_global_flushing_errors.md b/src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md index 4e388eb28..7548c2d7e 100644 --- a/health/guides/dbengine/10min_dbengine_global_flushing_errors.md +++ b/src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md @@ -9,5 +9,5 @@ faster disks. This alert is triggered in critical state when the number deleted ### Useful resources -[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine) +[Read more about Netdata DB engine](/src/database/README.md/engine) diff --git a/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md b/src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md index 1029e7f60..444796703 100644 --- a/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md +++ b/src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md @@ -11,5 +11,5 @@ This alert is triggered in warn state when the number of `dbengine` dirty pages ### Useful resources -[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine) +[Read more about Netdata DB engine](/src/database/README.md/engine) diff --git a/health/guides/dbengine/10min_dbengine_global_fs_errors.md b/src/health/guides/dbengine/10min_dbengine_global_fs_errors.md index 446289a9c..a4093681b 100644 --- a/health/guides/dbengine/10min_dbengine_global_fs_errors.md +++ b/src/health/guides/dbengine/10min_dbengine_global_fs_errors.md @@ -10,5 +10,5 @@ This alert is triggered in warning state when the number of filesystem errors is ### Useful resources -[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine) +[Read 
more about Netdata DB engine](/src/database/README.md/engine) diff --git a/health/guides/dbengine/10min_dbengine_global_io_errors.md b/src/health/guides/dbengine/10min_dbengine_global_io_errors.md index c47004f40..6bb831669 100644 --- a/health/guides/dbengine/10min_dbengine_global_io_errors.md +++ b/src/health/guides/dbengine/10min_dbengine_global_io_errors.md @@ -10,5 +10,5 @@ This alert is triggered in critical state when the number of IO errors is greate ### Useful resources -[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine) +[Read more about Netdata DB engine](/src/database/README.md/engine) diff --git a/health/guides/disks/10min_disk_backlog.md b/src/health/guides/disks/10min_disk_backlog.md index 9b0a275b1..9b0a275b1 100644 --- a/health/guides/disks/10min_disk_backlog.md +++ b/src/health/guides/disks/10min_disk_backlog.md diff --git a/health/guides/disks/10min_disk_utilization.md b/src/health/guides/disks/10min_disk_utilization.md index 41a987a42..41a987a42 100644 --- a/health/guides/disks/10min_disk_utilization.md +++ b/src/health/guides/disks/10min_disk_utilization.md diff --git a/health/guides/disks/bcache_cache_dirty.md b/src/health/guides/disks/bcache_cache_dirty.md index 11b74e522..11b74e522 100644 --- a/health/guides/disks/bcache_cache_dirty.md +++ b/src/health/guides/disks/bcache_cache_dirty.md diff --git a/health/guides/disks/bcache_cache_errors.md b/src/health/guides/disks/bcache_cache_errors.md index 5256c480f..5256c480f 100644 --- a/health/guides/disks/bcache_cache_errors.md +++ b/src/health/guides/disks/bcache_cache_errors.md diff --git a/health/guides/disks/disk_inode_usage.md b/src/health/guides/disks/disk_inode_usage.md index 3c9161063..3c9161063 100644 --- a/health/guides/disks/disk_inode_usage.md +++ b/src/health/guides/disks/disk_inode_usage.md diff --git a/health/guides/disks/disk_space_usage.md b/src/health/guides/disks/disk_space_usage.md index 14663942f..14663942f 100644 --- 
a/health/guides/disks/disk_space_usage.md +++ b/src/health/guides/disks/disk_space_usage.md diff --git a/health/guides/dns_query/dns_query_query_status.md b/src/health/guides/dns_query/dns_query_query_status.md index f47b8adee..f47b8adee 100644 --- a/health/guides/dns_query/dns_query_query_status.md +++ b/src/health/guides/dns_query/dns_query_query_status.md diff --git a/health/guides/dnsmasq/dnsmasq_dhcp_dhcp_range_utilization.md b/src/health/guides/dnsmasq/dnsmasq_dhcp_dhcp_range_utilization.md index d259ae40f..d259ae40f 100644 --- a/health/guides/dnsmasq/dnsmasq_dhcp_dhcp_range_utilization.md +++ b/src/health/guides/dnsmasq/dnsmasq_dhcp_dhcp_range_utilization.md diff --git a/health/guides/docker/docker_container_unhealthy.md b/src/health/guides/docker/docker_container_unhealthy.md index bdad26480..bdad26480 100644 --- a/health/guides/docker/docker_container_unhealthy.md +++ b/src/health/guides/docker/docker_container_unhealthy.md diff --git a/health/guides/elasticsearch/elasticsearch_cluster_health_status_red.md b/src/health/guides/elasticsearch/elasticsearch_cluster_health_status_red.md index 494a7853c..494a7853c 100644 --- a/health/guides/elasticsearch/elasticsearch_cluster_health_status_red.md +++ b/src/health/guides/elasticsearch/elasticsearch_cluster_health_status_red.md diff --git a/health/guides/elasticsearch/elasticsearch_cluster_health_status_yellow.md b/src/health/guides/elasticsearch/elasticsearch_cluster_health_status_yellow.md index 2f8bf854d..2f8bf854d 100644 --- a/health/guides/elasticsearch/elasticsearch_cluster_health_status_yellow.md +++ b/src/health/guides/elasticsearch/elasticsearch_cluster_health_status_yellow.md diff --git a/health/guides/elasticsearch/elasticsearch_node_index_health_red.md b/src/health/guides/elasticsearch/elasticsearch_node_index_health_red.md index 1e2877d14..1e2877d14 100644 --- a/health/guides/elasticsearch/elasticsearch_node_index_health_red.md +++ 
b/src/health/guides/elasticsearch/elasticsearch_node_index_health_red.md diff --git a/health/guides/elasticsearch/elasticsearch_node_indices_search_time_fetch.md b/src/health/guides/elasticsearch/elasticsearch_node_indices_search_time_fetch.md index e0bcc1125..e0bcc1125 100644 --- a/health/guides/elasticsearch/elasticsearch_node_indices_search_time_fetch.md +++ b/src/health/guides/elasticsearch/elasticsearch_node_indices_search_time_fetch.md diff --git a/health/guides/elasticsearch/elasticsearch_node_indices_search_time_query.md b/src/health/guides/elasticsearch/elasticsearch_node_indices_search_time_query.md index 3a82a64ac..3a82a64ac 100644 --- a/health/guides/elasticsearch/elasticsearch_node_indices_search_time_query.md +++ b/src/health/guides/elasticsearch/elasticsearch_node_indices_search_time_query.md diff --git a/health/guides/entropy/lowest_entropy.md b/src/health/guides/entropy/lowest_entropy.md index b53aed2c6..c25dc4d01 100644 --- a/health/guides/entropy/lowest_entropy.md +++ b/src/health/guides/entropy/lowest_entropy.md @@ -4,7 +4,7 @@ This alert presents the minimum amount of entropy in the kernel entropy pool in The Netdata Agent checks for the minimum entropy value in the last 5 minutes. The alert gets raised into warning if the value < 100, and cleared if the value > 200. -For further information on how our alerts are calculated, please have a look at our [Documentation](https://learn.netdata.cloud/docs/agent/health/reference#expressions). +For further information on how our alerts are calculated, please have a look at our [Documentation](/src/health/REFERENCE.md#expressions). ### What is entropy and why do we need it? @@ -12,7 +12,7 @@ Entropy is similar to "randomness". A Linux system gathers "real" random numbers Encryption and cryptography applications require random numbers to operate. A function or an algorithm that produces numbers -*that seem to be random*- is very predictable, if you know what function is used. 
-In real life, we use our surroundings and our thoughts to produce truly random numbers. A computer can't really do this by itself, so it gathers numbers from a lot of sources. For example, it can get the CO2 levels in a room from a sensor on the system and use that as a random number. +In real life, we use our surroundings and our thoughts to produce truly random numbers. A computer can't really do this by itself, so it gathers numbers from a lot of sources. For example, it can get the CO2 levels in a Room from a sensor on the system and use that as a random number. This way all the values are random and there is no pattern to be found among them. diff --git a/health/guides/exporting/exporting_last_buffering.md b/src/health/guides/exporting/exporting_last_buffering.md index 4b13fe761..1139b0b6d 100644 --- a/health/guides/exporting/exporting_last_buffering.md +++ b/src/health/guides/exporting/exporting_last_buffering.md @@ -26,4 +26,4 @@ This alert is related to the Netdata Exporting engine, which calculates the numb ### Useful resources -1. [Netdata Exporting Engine](https://learn.netdata.cloud/docs/exporting-data-to-other-systems/exporting-reference) +1. [Netdata Exporting Reference](/src/exporting/README.md) diff --git a/health/guides/exporting/exporting_metrics_sent.md b/src/health/guides/exporting/exporting_metrics_sent.md index f17f593c4..9896701ce 100644 --- a/health/guides/exporting/exporting_metrics_sent.md +++ b/src/health/guides/exporting/exporting_metrics_sent.md @@ -43,4 +43,4 @@ To troubleshoot this alert, follow these steps: ### Useful resources -1. [Netdata Exporting Engine documentation](https://learn.netdata.cloud/docs/exporting-data-to-other-systems/exporting-reference) +1. 
[Netdata Exporting Reference](/src/exporting/README.md) diff --git a/health/guides/gearman/gearman_workers_queued.md b/src/health/guides/gearman/gearman_workers_queued.md index cf9c481e6..cf9c481e6 100644 --- a/health/guides/gearman/gearman_workers_queued.md +++ b/src/health/guides/gearman/gearman_workers_queued.md diff --git a/health/guides/geth/geth_chainhead_diff_between_header_block.md b/src/health/guides/geth/geth_chainhead_diff_between_header_block.md index 18d20e3d3..18d20e3d3 100644 --- a/health/guides/geth/geth_chainhead_diff_between_header_block.md +++ b/src/health/guides/geth/geth_chainhead_diff_between_header_block.md diff --git a/health/guides/haproxy/haproxy_backend_server_status.md b/src/health/guides/haproxy/haproxy_backend_server_status.md index 3d95921ec..3d95921ec 100644 --- a/health/guides/haproxy/haproxy_backend_server_status.md +++ b/src/health/guides/haproxy/haproxy_backend_server_status.md diff --git a/health/guides/haproxy/haproxy_backend_status.md b/src/health/guides/haproxy/haproxy_backend_status.md index 47be09c73..47be09c73 100644 --- a/health/guides/haproxy/haproxy_backend_status.md +++ b/src/health/guides/haproxy/haproxy_backend_status.md diff --git a/health/guides/hdfs/hdfs_capacity_usage.md b/src/health/guides/hdfs/hdfs_capacity_usage.md index 666dcdc28..666dcdc28 100644 --- a/health/guides/hdfs/hdfs_capacity_usage.md +++ b/src/health/guides/hdfs/hdfs_capacity_usage.md diff --git a/health/guides/hdfs/hdfs_dead_nodes.md b/src/health/guides/hdfs/hdfs_dead_nodes.md index 9c65a0c66..9c65a0c66 100644 --- a/health/guides/hdfs/hdfs_dead_nodes.md +++ b/src/health/guides/hdfs/hdfs_dead_nodes.md diff --git a/health/guides/hdfs/hdfs_missing_blocks.md b/src/health/guides/hdfs/hdfs_missing_blocks.md index 490028805..490028805 100644 --- a/health/guides/hdfs/hdfs_missing_blocks.md +++ b/src/health/guides/hdfs/hdfs_missing_blocks.md diff --git a/health/guides/hdfs/hdfs_num_failed_volumes.md b/src/health/guides/hdfs/hdfs_num_failed_volumes.md index 
bdb23f243..bdb23f243 100644 --- a/health/guides/hdfs/hdfs_num_failed_volumes.md +++ b/src/health/guides/hdfs/hdfs_num_failed_volumes.md diff --git a/health/guides/hdfs/hdfs_stale_nodes.md b/src/health/guides/hdfs/hdfs_stale_nodes.md index 71ca50f95..71ca50f95 100644 --- a/health/guides/hdfs/hdfs_stale_nodes.md +++ b/src/health/guides/hdfs/hdfs_stale_nodes.md diff --git a/health/guides/httpcheck/httpcheck_web_service_bad_content.md b/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md index 0a5961ca7..433425e09 100644 --- a/health/guides/httpcheck/httpcheck_web_service_bad_content.md +++ b/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md @@ -27,4 +27,4 @@ sudo ./edit-config go.d/httpcheck.conf ### Useful resources -1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck)
\ No newline at end of file +1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md)
\ No newline at end of file diff --git a/health/guides/httpcheck/httpcheck_web_service_bad_status.md b/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md index bd9c14341..60fabd751 100644 --- a/health/guides/httpcheck/httpcheck_web_service_bad_status.md +++ b/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md @@ -18,4 +18,4 @@ root@netdata # curl -v <your_http_endpoint>:<port>/<path> ### Useful resources -1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck) +1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md) diff --git a/health/guides/httpcheck/httpcheck_web_service_no_connection.md b/src/health/guides/httpcheck/httpcheck_web_service_no_connection.md index 0f36803fe..0f36803fe 100644 --- a/health/guides/httpcheck/httpcheck_web_service_no_connection.md +++ b/src/health/guides/httpcheck/httpcheck_web_service_no_connection.md diff --git a/health/guides/httpcheck/httpcheck_web_service_slow.md b/src/health/guides/httpcheck/httpcheck_web_service_slow.md index aad2cc8da..4f962e155 100644 --- a/health/guides/httpcheck/httpcheck_web_service_slow.md +++ b/src/health/guides/httpcheck/httpcheck_web_service_slow.md @@ -14,5 +14,5 @@ To troubleshoot this issue, check for: ### Useful resources -1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck) +1. 
[HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md) diff --git a/health/guides/httpcheck/httpcheck_web_service_timeouts.md b/src/health/guides/httpcheck/httpcheck_web_service_timeouts.md index 03e300d1d..3e4c4debf 100644 --- a/health/guides/httpcheck/httpcheck_web_service_timeouts.md +++ b/src/health/guides/httpcheck/httpcheck_web_service_timeouts.md @@ -15,7 +15,7 @@ An HTTP request timeout occurs when a client (such as a web browser) sends a req - Verify the issue -Check the HTTP endpoint to see if it is responsive and reachable. You can use tools like `curl` or online services like [https://www.isitdownrightnow.com/](https://www.isitdownrightnow.com/) to check the availability of the website or service. +Check the HTTP endpoint to see if it is responsive and reachable. You can use tools like `curl` or online services like <https://www.isitdownrightnow.com/> to check the availability of the website or service. - Analyze server logs @@ -36,4 +36,3 @@ Make sure your web server configurations are optimized for performance. For inst - Verify network configurations Examine the network configurations for potential issues that can lead to HTTP request timeouts. Check for misconfigured firewalls or faulty load balancers that may be interfering with traffic to the HTTP endpoint. - diff --git a/health/guides/httpcheck/httpcheck_web_service_unreachable.md b/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md index bb6f51bf5..c77d33c0b 100644 --- a/health/guides/httpcheck/httpcheck_web_service_unreachable.md +++ b/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md @@ -30,4 +30,4 @@ To troubleshoot this error, check the following: ### Useful resources -1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck)
\ No newline at end of file +1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md)
\ No newline at end of file diff --git a/health/guides/httpcheck/httpcheck_web_service_up.md b/src/health/guides/httpcheck/httpcheck_web_service_up.md index be17fadd5..be17fadd5 100644 --- a/health/guides/httpcheck/httpcheck_web_service_up.md +++ b/src/health/guides/httpcheck/httpcheck_web_service_up.md diff --git a/health/guides/ioping/ioping_disk_latency.md b/src/health/guides/ioping/ioping_disk_latency.md index cc4fdc697..cc4fdc697 100644 --- a/health/guides/ioping/ioping_disk_latency.md +++ b/src/health/guides/ioping/ioping_disk_latency.md diff --git a/health/guides/ipc/semaphore_arrays_used.md b/src/health/guides/ipc/semaphore_arrays_used.md index d12dacd47..d12dacd47 100644 --- a/health/guides/ipc/semaphore_arrays_used.md +++ b/src/health/guides/ipc/semaphore_arrays_used.md diff --git a/health/guides/ipc/semaphores_used.md b/src/health/guides/ipc/semaphores_used.md index 145ef0ad4..e58d1a60e 100644 --- a/health/guides/ipc/semaphores_used.md +++ b/src/health/guides/ipc/semaphores_used.md @@ -44,5 +44,4 @@ This alert monitors the percentage of allocated `System V IPC semaphores`. If yo ### Useful resources 1. [Interprocess Communication](https://docs.oracle.com/cd/E19455-01/806-4750/6jdqdfltn/index.html) -2. [IPC: Semaphores](https://users.cs.cf.ac.uk/Dave.Marshall/C/node26.html) -3. [Linux Kernel Documentation - IPC Semaphores](https://www.kernel.org/doc/Documentation/ipc/semaphore.txt)
\ No newline at end of file +2. [IPC: Semaphores](https://users.cs.cf.ac.uk/Dave.Marshall/C/node26.html)
\ No newline at end of file diff --git a/health/guides/ipfs/ipfs_datastore_usage.md b/src/health/guides/ipfs/ipfs_datastore_usage.md index 65c84c8b0..65c84c8b0 100644 --- a/health/guides/ipfs/ipfs_datastore_usage.md +++ b/src/health/guides/ipfs/ipfs_datastore_usage.md diff --git a/health/guides/ipmi/ipmi_events.md b/src/health/guides/ipmi/ipmi_events.md index 284abd4cd..284abd4cd 100644 --- a/health/guides/ipmi/ipmi_events.md +++ b/src/health/guides/ipmi/ipmi_events.md diff --git a/health/guides/ipmi/ipmi_sensors_states.md b/src/health/guides/ipmi/ipmi_sensors_states.md index e7521a306..e7521a306 100644 --- a/health/guides/ipmi/ipmi_sensors_states.md +++ b/src/health/guides/ipmi/ipmi_sensors_states.md diff --git a/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_05.md b/src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_05.md index 595fae8a5..595fae8a5 100644 --- a/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_05.md +++ b/src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_05.md diff --git a/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_09.md b/src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_09.md index 05c030649..05c030649 100644 --- a/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_09.md +++ b/src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_09.md diff --git a/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_099.md b/src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_099.md index 76f1123ef..76f1123ef 100644 --- a/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_099.md +++ b/src/health/guides/kubelet/kubelet_10s_pleg_relist_latency_quantile_099.md diff --git a/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_05.md b/src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_05.md index b448c4d9e..b448c4d9e 100644 --- 
a/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_05.md +++ b/src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_05.md diff --git a/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_09.md b/src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_09.md index 6c71f1cf6..6c71f1cf6 100644 --- a/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_09.md +++ b/src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_09.md diff --git a/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_099.md b/src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_099.md index 39e031628..39e031628 100644 --- a/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_099.md +++ b/src/health/guides/kubelet/kubelet_1m_pleg_relist_latency_quantile_099.md diff --git a/health/guides/kubelet/kubelet_node_config_error.md b/src/health/guides/kubelet/kubelet_node_config_error.md index 695a479c9..695a479c9 100644 --- a/health/guides/kubelet/kubelet_node_config_error.md +++ b/src/health/guides/kubelet/kubelet_node_config_error.md diff --git a/health/guides/kubelet/kubelet_operations_error.md b/src/health/guides/kubelet/kubelet_operations_error.md index 870993b55..870993b55 100644 --- a/health/guides/kubelet/kubelet_operations_error.md +++ b/src/health/guides/kubelet/kubelet_operations_error.md diff --git a/health/guides/kubelet/kubelet_token_requests.md b/src/health/guides/kubelet/kubelet_token_requests.md index 28d70241f..28d70241f 100644 --- a/health/guides/kubelet/kubelet_token_requests.md +++ b/src/health/guides/kubelet/kubelet_token_requests.md diff --git a/health/guides/linux_power_supply/linux_power_supply_capacity.md b/src/health/guides/linux_power_supply/linux_power_supply_capacity.md index 10ee32f4e..10ee32f4e 100644 --- a/health/guides/linux_power_supply/linux_power_supply_capacity.md +++ b/src/health/guides/linux_power_supply/linux_power_supply_capacity.md diff --git 
a/health/guides/load/load_average_1.md b/src/health/guides/load/load_average_1.md index 1f33f8ff5..1f33f8ff5 100644 --- a/health/guides/load/load_average_1.md +++ b/src/health/guides/load/load_average_1.md diff --git a/health/guides/load/load_average_15.md b/src/health/guides/load/load_average_15.md index ba8b1e3e0..37df648a5 100644 --- a/health/guides/load/load_average_15.md +++ b/src/health/guides/load/load_average_15.md @@ -4,7 +4,7 @@ This alarm calculates the system `load average` (CPU and I/O demand) over the pe The alert gets raised into warning if the metric is 2 times the expected value and cleared if the value is 1.75 times the expected value. -For further information on how our alerts are calculated, please have a look at our [Documentation](https://learn.netdata.cloud/docs/agent/health/reference#expressions). +For further information on how our alerts are calculated, please have a look at our [Documentation](/src/health/REFERENCE.md#expressions). ### What does "load average" mean? diff --git a/health/guides/load/load_average_5.md b/src/health/guides/load/load_average_5.md index 6eacfcec9..d284eb963 100644 --- a/health/guides/load/load_average_5.md +++ b/src/health/guides/load/load_average_5.md @@ -4,7 +4,7 @@ This alarm calculates the system `load average` (CPU and I/O demand) over the pe The alert gets raised into warning if the metric is 4 times the expected value and cleared if the value is 3.5 times the expected value. -For further information on how our alerts are calculated, please have a look at our [Documentation](https://learn.netdata.cloud/docs/agent/health/reference#expressions). +For further information on how our alerts are calculated, please have a look at our [Documentation](/src/health/REFERENCE.md#expressions). ### What does "load average" mean? 
diff --git a/health/guides/load/load_cpu_number.md b/src/health/guides/load/load_cpu_number.md index 250a6d069..250a6d069 100644 --- a/health/guides/load/load_cpu_number.md +++ b/src/health/guides/load/load_cpu_number.md diff --git a/health/guides/mdstat/mdstat_disks.md b/src/health/guides/mdstat/mdstat_disks.md index c3daf9619..c3daf9619 100644 --- a/health/guides/mdstat/mdstat_disks.md +++ b/src/health/guides/mdstat/mdstat_disks.md diff --git a/health/guides/mdstat/mdstat_mismatch_cnt.md b/src/health/guides/mdstat/mdstat_mismatch_cnt.md index 7a156e38f..7a156e38f 100644 --- a/health/guides/mdstat/mdstat_mismatch_cnt.md +++ b/src/health/guides/mdstat/mdstat_mismatch_cnt.md diff --git a/health/guides/mdstat/mdstat_nonredundant_last_collected.md b/src/health/guides/mdstat/mdstat_nonredundant_last_collected.md index f76c61483..2cc9574a1 100644 --- a/health/guides/mdstat/mdstat_nonredundant_last_collected.md +++ b/src/health/guides/mdstat/mdstat_nonredundant_last_collected.md @@ -52,4 +52,4 @@ The md (multiple device) driver is responsible for managing software RAID arrays ### Useful resources 1. [Linux RAID: A Quick Guide](https://www.cyberciti.biz/tips/linux-raid-increase-resync-rebuild-speed.html) -2. [Netdata Agent Configuration Guide](https://learn.netdata.cloud/docs/agent/daemon/config) +2. 
[Netdata Agent Configuration Guide](/src/daemon/config/README.md) diff --git a/health/guides/megacli/megacli_adapter_state.md b/src/health/guides/megacli/megacli_adapter_state.md index 1202184e9..1202184e9 100644 --- a/health/guides/megacli/megacli_adapter_state.md +++ b/src/health/guides/megacli/megacli_adapter_state.md diff --git a/health/guides/megacli/megacli_bbu_cycle_count.md b/src/health/guides/megacli/megacli_bbu_cycle_count.md index 14f1d22dd..14f1d22dd 100644 --- a/health/guides/megacli/megacli_bbu_cycle_count.md +++ b/src/health/guides/megacli/megacli_bbu_cycle_count.md diff --git a/health/guides/megacli/megacli_bbu_relative_charge.md b/src/health/guides/megacli/megacli_bbu_relative_charge.md index 74a03a3b1..74a03a3b1 100644 --- a/health/guides/megacli/megacli_bbu_relative_charge.md +++ b/src/health/guides/megacli/megacli_bbu_relative_charge.md diff --git a/health/guides/megacli/megacli_pd_media_errors.md b/src/health/guides/megacli/megacli_pd_media_errors.md index 8988d09e5..8988d09e5 100644 --- a/health/guides/megacli/megacli_pd_media_errors.md +++ b/src/health/guides/megacli/megacli_pd_media_errors.md diff --git a/health/guides/megacli/megacli_pd_predictive_failures.md b/src/health/guides/megacli/megacli_pd_predictive_failures.md index 1aa7b0d20..1aa7b0d20 100644 --- a/health/guides/megacli/megacli_pd_predictive_failures.md +++ b/src/health/guides/megacli/megacli_pd_predictive_failures.md diff --git a/health/guides/memcached/memcached_cache_fill_rate.md b/src/health/guides/memcached/memcached_cache_fill_rate.md index ec276b3a7..ec276b3a7 100644 --- a/health/guides/memcached/memcached_cache_fill_rate.md +++ b/src/health/guides/memcached/memcached_cache_fill_rate.md diff --git a/health/guides/memcached/memcached_cache_memory_usage.md b/src/health/guides/memcached/memcached_cache_memory_usage.md index 2a14f01fc..2a14f01fc 100644 --- a/health/guides/memcached/memcached_cache_memory_usage.md +++ 
b/src/health/guides/memcached/memcached_cache_memory_usage.md diff --git a/health/guides/memcached/memcached_out_of_cache_space_time.md b/src/health/guides/memcached/memcached_out_of_cache_space_time.md index 5f546553c..5f546553c 100644 --- a/health/guides/memcached/memcached_out_of_cache_space_time.md +++ b/src/health/guides/memcached/memcached_out_of_cache_space_time.md diff --git a/health/guides/memory/1hour_ecc_memory_correctable.md b/src/health/guides/memory/1hour_ecc_memory_correctable.md index 1893bbf7e..1893bbf7e 100644 --- a/health/guides/memory/1hour_ecc_memory_correctable.md +++ b/src/health/guides/memory/1hour_ecc_memory_correctable.md diff --git a/health/guides/memory/1hour_ecc_memory_uncorrectable.md b/src/health/guides/memory/1hour_ecc_memory_uncorrectable.md index 509ff5448..509ff5448 100644 --- a/health/guides/memory/1hour_ecc_memory_uncorrectable.md +++ b/src/health/guides/memory/1hour_ecc_memory_uncorrectable.md diff --git a/health/guides/memory/1hour_memory_hw_corrupted.md b/src/health/guides/memory/1hour_memory_hw_corrupted.md index 1be030480..1be030480 100644 --- a/health/guides/memory/1hour_memory_hw_corrupted.md +++ b/src/health/guides/memory/1hour_memory_hw_corrupted.md diff --git a/src/health/guides/ml/ml_1min_node_ar.md b/src/health/guides/ml/ml_1min_node_ar.md new file mode 100644 index 000000000..b5f12389b --- /dev/null +++ b/src/health/guides/ml/ml_1min_node_ar.md @@ -0,0 +1,26 @@ +### Understand the alert + +This alert is triggered when the [node anomaly rate](/src/ml/README.md) exceeds the threshold defined in the [alert configuration](https://github.com/netdata/netdata/blob/master/src/health/health.d/ml.conf) over the most recent 1 minute window evaluated. + +For example, with the default of `warn: $this > 1`, this means that 1% or more of the metrics collected on the node have across the most recent 1 minute window been flagged as [anomalous](/src/ml/README.md) by Netdata. 
+ +### Troubleshoot the alert + +This alert is a signal that some significant percentage of metrics within your infrastructure have been flagged as anomalous according to the ML based anomaly detection models the Netdata agent continually trains and re-trains for each metric. This tells us something somewhere might look strange in some way. The next step is to try to drill in and see what metrics are actually driving this. + +1. **Filter for the node or nodes relevant**: First we need to reduce as much noise as possible by filtering for just those nodes that have the elevated node anomaly rate. Look at the `anomaly_detection.anomaly_rate` chart and group by `node` to see which nodes have an elevated anomaly rate. Filter for just those nodes since this will reduce any noise as much as possible. + +2. **Highlight the area of interest**: Highlight the timeframe of interest where you see an elevated anomaly rate. + +3. **Check the anomalies tab**: Check the [Anomaly Advisor tab](/docs/dashboards-and-charts/anomaly-advisor-tab.md) to see an ordered list of what metrics were most anomalous in the highlighted window. + +4. **Press the AR% button on Overview**: You can also press the "[AR%](https://blog.netdata.cloud/anomaly-rates-in-the-menu/)" button on the Overview or single node dashboard to see what parts of the menu have the highest chart anomaly rates. Pressing the AR% button should add some "pills" to each menu item and if you hover over it you will see that chart within each menu section that was most anomalous during the highlighted timeframe. + +5. **Use Metric Correlations**: Use [metric correlations](/docs/metric-correlations.md) to see what metrics may have changed most significantly comparing before to the highlighted timeframe. + +### Useful resources + +1. [Machine learning (ML) powered anomaly detection](/src/ml/README.md) +2. [Anomaly Advisor tab](/docs/dashboards-and-charts/anomaly-advisor-tab.md) +3. [Metric Correlations](/docs/metric-correlations.md) +4.
[Anomaly Rates in the Menu!](https://blog.netdata.cloud/anomaly-rates-in-the-menu/) diff --git a/health/guides/mysql/mysql_10s_slow_queries.md b/src/health/guides/mysql/mysql_10s_slow_queries.md index 173218448..173218448 100644 --- a/health/guides/mysql/mysql_10s_slow_queries.md +++ b/src/health/guides/mysql/mysql_10s_slow_queries.md diff --git a/health/guides/mysql/mysql_10s_table_locks_immediate.md b/src/health/guides/mysql/mysql_10s_table_locks_immediate.md index 7b375b43d..7b375b43d 100644 --- a/health/guides/mysql/mysql_10s_table_locks_immediate.md +++ b/src/health/guides/mysql/mysql_10s_table_locks_immediate.md diff --git a/health/guides/mysql/mysql_10s_table_locks_waited.md b/src/health/guides/mysql/mysql_10s_table_locks_waited.md index 1cac9e929..1cac9e929 100644 --- a/health/guides/mysql/mysql_10s_table_locks_waited.md +++ b/src/health/guides/mysql/mysql_10s_table_locks_waited.md diff --git a/health/guides/mysql/mysql_10s_waited_locks_ratio.md b/src/health/guides/mysql/mysql_10s_waited_locks_ratio.md index 60c030590..60c030590 100644 --- a/health/guides/mysql/mysql_10s_waited_locks_ratio.md +++ b/src/health/guides/mysql/mysql_10s_waited_locks_ratio.md diff --git a/health/guides/mysql/mysql_connections.md b/src/health/guides/mysql/mysql_connections.md index 2f57fef2d..2f57fef2d 100644 --- a/health/guides/mysql/mysql_connections.md +++ b/src/health/guides/mysql/mysql_connections.md diff --git a/health/guides/mysql/mysql_galera_cluster_size.md b/src/health/guides/mysql/mysql_galera_cluster_size.md index ebe5d64a3..ebe5d64a3 100644 --- a/health/guides/mysql/mysql_galera_cluster_size.md +++ b/src/health/guides/mysql/mysql_galera_cluster_size.md diff --git a/health/guides/mysql/mysql_galera_cluster_size_max_2m.md b/src/health/guides/mysql/mysql_galera_cluster_size_max_2m.md index 0f14ca8af..0f14ca8af 100644 --- a/health/guides/mysql/mysql_galera_cluster_size_max_2m.md +++ b/src/health/guides/mysql/mysql_galera_cluster_size_max_2m.md diff --git 
a/health/guides/mysql/mysql_galera_cluster_state_crit.md b/src/health/guides/mysql/mysql_galera_cluster_state_crit.md index c1ac649e5..c1ac649e5 100644 --- a/health/guides/mysql/mysql_galera_cluster_state_crit.md +++ b/src/health/guides/mysql/mysql_galera_cluster_state_crit.md diff --git a/health/guides/mysql/mysql_galera_cluster_state_warn.md b/src/health/guides/mysql/mysql_galera_cluster_state_warn.md index e03ffa2ea..e03ffa2ea 100644 --- a/health/guides/mysql/mysql_galera_cluster_state_warn.md +++ b/src/health/guides/mysql/mysql_galera_cluster_state_warn.md diff --git a/health/guides/mysql/mysql_galera_cluster_status.md b/src/health/guides/mysql/mysql_galera_cluster_status.md index c5b07516f..c5b07516f 100644 --- a/health/guides/mysql/mysql_galera_cluster_status.md +++ b/src/health/guides/mysql/mysql_galera_cluster_status.md diff --git a/health/guides/mysql/mysql_replication.md b/src/health/guides/mysql/mysql_replication.md index 50f7e5437..50f7e5437 100644 --- a/health/guides/mysql/mysql_replication.md +++ b/src/health/guides/mysql/mysql_replication.md diff --git a/health/guides/mysql/mysql_replication_lag.md b/src/health/guides/mysql/mysql_replication_lag.md index 9c57f8108..9c57f8108 100644 --- a/health/guides/mysql/mysql_replication_lag.md +++ b/src/health/guides/mysql/mysql_replication_lag.md diff --git a/health/guides/net/10min_fifo_errors.md b/src/health/guides/net/10min_fifo_errors.md index 845ae6aff..845ae6aff 100644 --- a/health/guides/net/10min_fifo_errors.md +++ b/src/health/guides/net/10min_fifo_errors.md diff --git a/health/guides/net/10min_netisr_backlog_exceeded.md b/src/health/guides/net/10min_netisr_backlog_exceeded.md index d40d2c9ae..d40d2c9ae 100644 --- a/health/guides/net/10min_netisr_backlog_exceeded.md +++ b/src/health/guides/net/10min_netisr_backlog_exceeded.md diff --git a/health/guides/net/10s_received_packets_storm.md b/src/health/guides/net/10s_received_packets_storm.md index 29e1f5346..29e1f5346 100644 --- 
a/health/guides/net/10s_received_packets_storm.md +++ b/src/health/guides/net/10s_received_packets_storm.md diff --git a/health/guides/net/1m_received_packets_rate.md b/src/health/guides/net/1m_received_packets_rate.md index 891e8bf39..891e8bf39 100644 --- a/health/guides/net/1m_received_packets_rate.md +++ b/src/health/guides/net/1m_received_packets_rate.md diff --git a/health/guides/net/1m_received_traffic_overflow.md b/src/health/guides/net/1m_received_traffic_overflow.md index 270dd892d..270dd892d 100644 --- a/health/guides/net/1m_received_traffic_overflow.md +++ b/src/health/guides/net/1m_received_traffic_overflow.md diff --git a/health/guides/net/1m_sent_traffic_overflow.md b/src/health/guides/net/1m_sent_traffic_overflow.md index 376d578cd..376d578cd 100644 --- a/health/guides/net/1m_sent_traffic_overflow.md +++ b/src/health/guides/net/1m_sent_traffic_overflow.md diff --git a/health/guides/net/inbound_packets_dropped.md b/src/health/guides/net/inbound_packets_dropped.md index e25196309..e25196309 100644 --- a/health/guides/net/inbound_packets_dropped.md +++ b/src/health/guides/net/inbound_packets_dropped.md diff --git a/health/guides/net/inbound_packets_dropped_ratio.md b/src/health/guides/net/inbound_packets_dropped_ratio.md index 7bc9ed8e5..7bc9ed8e5 100644 --- a/health/guides/net/inbound_packets_dropped_ratio.md +++ b/src/health/guides/net/inbound_packets_dropped_ratio.md diff --git a/health/guides/net/interface_inbound_errors.md b/src/health/guides/net/interface_inbound_errors.md index 6c8bcfcd3..6c8bcfcd3 100644 --- a/health/guides/net/interface_inbound_errors.md +++ b/src/health/guides/net/interface_inbound_errors.md diff --git a/health/guides/net/interface_outbound_errors.md b/src/health/guides/net/interface_outbound_errors.md index 194d8aba2..194d8aba2 100644 --- a/health/guides/net/interface_outbound_errors.md +++ b/src/health/guides/net/interface_outbound_errors.md diff --git a/health/guides/net/interface_speed.md 
b/src/health/guides/net/interface_speed.md index 89f967c57..89f967c57 100644 --- a/health/guides/net/interface_speed.md +++ b/src/health/guides/net/interface_speed.md diff --git a/health/guides/net/outbound_packets_dropped.md b/src/health/guides/net/outbound_packets_dropped.md index 49291d1d9..f943c3fd8 100644 --- a/health/guides/net/outbound_packets_dropped.md +++ b/src/health/guides/net/outbound_packets_dropped.md @@ -14,7 +14,7 @@ Check the alert message for the `${label:device}` placeholder. It indicates the 2. Verify network congestion or excessive traffic: -Excessive traffic or network congestion can lead to dropped packets. To check network traffic, use the `nload` tool. If it isn't installed, you can follow the instructions given [here](https://www.howtoforge.com/tutorial/install-nload-on-linux/). +Excessive traffic or network congestion can lead to dropped packets. To check network traffic, use the `nload` tool. ```bash nload ${label:device} @@ -22,7 +22,7 @@ nload ${label:device} This will display the current network bandwidth usage on the specified interface. Look for unusually high or fluctuating usage patterns, which could indicate congestion or excessive traffic. -3. Verify hardware issues: +1. Verify hardware issues: Check the network interface and related hardware components (such as the network card, cables, and switches) for visible damage, loose connections, or other issues. Replace any defective components as needed. 
diff --git a/health/guides/net/outbound_packets_dropped_ratio.md b/src/health/guides/net/outbound_packets_dropped_ratio.md index 9b90a97b5..9b90a97b5 100644 --- a/health/guides/net/outbound_packets_dropped_ratio.md +++ b/src/health/guides/net/outbound_packets_dropped_ratio.md diff --git a/health/guides/netdev/1min_netdev_backlog_exceeded.md b/src/health/guides/netdev/1min_netdev_backlog_exceeded.md index dc7b6a2c9..dc7b6a2c9 100644 --- a/health/guides/netdev/1min_netdev_backlog_exceeded.md +++ b/src/health/guides/netdev/1min_netdev_backlog_exceeded.md diff --git a/health/guides/netdev/1min_netdev_budget_ran_outs.md b/src/health/guides/netdev/1min_netdev_budget_ran_outs.md index 305393225..305393225 100644 --- a/health/guides/netdev/1min_netdev_budget_ran_outs.md +++ b/src/health/guides/netdev/1min_netdev_budget_ran_outs.md diff --git a/health/guides/netfilter/netfilter_conntrack_full.md b/src/health/guides/netfilter/netfilter_conntrack_full.md index 667f0e499..667f0e499 100644 --- a/health/guides/netfilter/netfilter_conntrack_full.md +++ b/src/health/guides/netfilter/netfilter_conntrack_full.md diff --git a/health/guides/nvme/nvme_device_critical_warnings_state.md b/src/health/guides/nvme/nvme_device_critical_warnings_state.md index a12381bb3..a12381bb3 100644 --- a/health/guides/nvme/nvme_device_critical_warnings_state.md +++ b/src/health/guides/nvme/nvme_device_critical_warnings_state.md diff --git a/health/guides/pihole/pihole_blocklist_last_update.md b/src/health/guides/pihole/pihole_blocklist_last_update.md index d358e04ce..d358e04ce 100644 --- a/health/guides/pihole/pihole_blocklist_last_update.md +++ b/src/health/guides/pihole/pihole_blocklist_last_update.md diff --git a/health/guides/pihole/pihole_status.md b/src/health/guides/pihole/pihole_status.md index 57dd203f4..57dd203f4 100644 --- a/health/guides/pihole/pihole_status.md +++ b/src/health/guides/pihole/pihole_status.md diff --git a/health/guides/ping/ping_host_latency.md 
b/src/health/guides/ping/ping_host_latency.md index 59ea1be64..59ea1be64 100644 --- a/health/guides/ping/ping_host_latency.md +++ b/src/health/guides/ping/ping_host_latency.md diff --git a/health/guides/ping/ping_host_reachable.md b/src/health/guides/ping/ping_host_reachable.md index 75e24cbee..75e24cbee 100644 --- a/health/guides/ping/ping_host_reachable.md +++ b/src/health/guides/ping/ping_host_reachable.md diff --git a/health/guides/ping/ping_packet_loss.md b/src/health/guides/ping/ping_packet_loss.md index 546ecb000..546ecb000 100644 --- a/health/guides/ping/ping_packet_loss.md +++ b/src/health/guides/ping/ping_packet_loss.md diff --git a/health/guides/portcheck/portcheck_connection_fails.md b/src/health/guides/portcheck/portcheck_connection_fails.md index 781cf7a01..781cf7a01 100644 --- a/health/guides/portcheck/portcheck_connection_fails.md +++ b/src/health/guides/portcheck/portcheck_connection_fails.md diff --git a/health/guides/portcheck/portcheck_connection_timeouts.md b/src/health/guides/portcheck/portcheck_connection_timeouts.md index 5386f1509..b3608f62e 100644 --- a/health/guides/portcheck/portcheck_connection_timeouts.md +++ b/src/health/guides/portcheck/portcheck_connection_timeouts.md @@ -37,5 +37,4 @@ This alert triggers a warning state when the ratio of timeouts is between 10-40% ### Useful resources 1. [Netstat Command in Linux](https://www.tecmint.com/20-netstat-commands-for-linux-network-management/) -2. [Iostat Command Usage and Examples](https://www.thomas-krenn.com/en/wiki/Iostat_command_usage_and_examples) -3. [Iftop Guide](https://www.tecmint.com/iftop-linux-network-bandwidth-monitoring-tool/) +2. 
[Iftop Guide](https://www.tecmint.com/iftop-linux-network-bandwidth-monitoring-tool/) diff --git a/health/guides/portcheck/portcheck_service_reachable.md b/src/health/guides/portcheck/portcheck_service_reachable.md index 550db585e..550db585e 100644 --- a/health/guides/portcheck/portcheck_service_reachable.md +++ b/src/health/guides/portcheck/portcheck_service_reachable.md diff --git a/health/guides/postgres/postgres_acquired_locks_utilization.md b/src/health/guides/postgres/postgres_acquired_locks_utilization.md index d0b76eae3..d0b76eae3 100644 --- a/health/guides/postgres/postgres_acquired_locks_utilization.md +++ b/src/health/guides/postgres/postgres_acquired_locks_utilization.md diff --git a/health/guides/postgres/postgres_db_cache_io_ratio.md b/src/health/guides/postgres/postgres_db_cache_io_ratio.md index d39329763..d39329763 100644 --- a/health/guides/postgres/postgres_db_cache_io_ratio.md +++ b/src/health/guides/postgres/postgres_db_cache_io_ratio.md diff --git a/health/guides/postgres/postgres_db_deadlocks_rate.md b/src/health/guides/postgres/postgres_db_deadlocks_rate.md index 0b670b640..0b670b640 100644 --- a/health/guides/postgres/postgres_db_deadlocks_rate.md +++ b/src/health/guides/postgres/postgres_db_deadlocks_rate.md diff --git a/health/guides/postgres/postgres_db_transactions_rollback_ratio.md b/src/health/guides/postgres/postgres_db_transactions_rollback_ratio.md index b2f94fede..b2f94fede 100644 --- a/health/guides/postgres/postgres_db_transactions_rollback_ratio.md +++ b/src/health/guides/postgres/postgres_db_transactions_rollback_ratio.md diff --git a/health/guides/postgres/postgres_index_bloat_size_perc.md b/src/health/guides/postgres/postgres_index_bloat_size_perc.md index bd6e4ba08..bd6e4ba08 100644 --- a/health/guides/postgres/postgres_index_bloat_size_perc.md +++ b/src/health/guides/postgres/postgres_index_bloat_size_perc.md diff --git a/health/guides/postgres/postgres_table_bloat_size_perc.md 
b/src/health/guides/postgres/postgres_table_bloat_size_perc.md index 0edc21bb1..0edc21bb1 100644 --- a/health/guides/postgres/postgres_table_bloat_size_perc.md +++ b/src/health/guides/postgres/postgres_table_bloat_size_perc.md diff --git a/health/guides/postgres/postgres_table_cache_io_ratio.md b/src/health/guides/postgres/postgres_table_cache_io_ratio.md index 382f8ee4d..712f4aafc 100644 --- a/health/guides/postgres/postgres_table_cache_io_ratio.md +++ b/src/health/guides/postgres/postgres_table_cache_io_ratio.md @@ -28,5 +28,4 @@ Keep monitoring cache hit ratios after making changes to your configuration or o ### Useful resources -1. [Tuning Your PostgreSQL Server](https://www.postgresql.org/docs/current/runtime-config-resource.html) -2. [Performance Monitoring and Tuning in PostgreSQL](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin/postgres#monitoring) +1. [Tuning Your PostgreSQL Server](https://www.postgresql.org/docs/current/runtime-config-resource.html)
\ No newline at end of file diff --git a/health/guides/postgres/postgres_table_index_cache_io_ratio.md b/src/health/guides/postgres/postgres_table_index_cache_io_ratio.md index 5c5bb2bd8..5c5bb2bd8 100644 --- a/health/guides/postgres/postgres_table_index_cache_io_ratio.md +++ b/src/health/guides/postgres/postgres_table_index_cache_io_ratio.md diff --git a/health/guides/postgres/postgres_table_last_autoanalyze_time.md b/src/health/guides/postgres/postgres_table_last_autoanalyze_time.md index 1a7a3d79b..1a7a3d79b 100644 --- a/health/guides/postgres/postgres_table_last_autoanalyze_time.md +++ b/src/health/guides/postgres/postgres_table_last_autoanalyze_time.md diff --git a/health/guides/postgres/postgres_table_last_autovacuum_time.md b/src/health/guides/postgres/postgres_table_last_autovacuum_time.md index 8a79b0d31..8a79b0d31 100644 --- a/health/guides/postgres/postgres_table_last_autovacuum_time.md +++ b/src/health/guides/postgres/postgres_table_last_autovacuum_time.md diff --git a/health/guides/postgres/postgres_table_toast_cache_io_ratio.md b/src/health/guides/postgres/postgres_table_toast_cache_io_ratio.md index c33a2373c..c33a2373c 100644 --- a/health/guides/postgres/postgres_table_toast_cache_io_ratio.md +++ b/src/health/guides/postgres/postgres_table_toast_cache_io_ratio.md diff --git a/health/guides/postgres/postgres_table_toast_index_cache_io_ratio.md b/src/health/guides/postgres/postgres_table_toast_index_cache_io_ratio.md index 6aeb38624..6aeb38624 100644 --- a/health/guides/postgres/postgres_table_toast_index_cache_io_ratio.md +++ b/src/health/guides/postgres/postgres_table_toast_index_cache_io_ratio.md diff --git a/health/guides/postgres/postgres_total_connection_utilization.md b/src/health/guides/postgres/postgres_total_connection_utilization.md index 266f4cbd0..266f4cbd0 100644 --- a/health/guides/postgres/postgres_total_connection_utilization.md +++ b/src/health/guides/postgres/postgres_total_connection_utilization.md diff --git 
a/health/guides/postgres/postgres_txid_exhaustion_perc.md b/src/health/guides/postgres/postgres_txid_exhaustion_perc.md index 9c2284956..9c2284956 100644 --- a/health/guides/postgres/postgres_txid_exhaustion_perc.md +++ b/src/health/guides/postgres/postgres_txid_exhaustion_perc.md diff --git a/health/guides/processes/active_processes.md b/src/health/guides/processes/active_processes.md index 75ddd827c..75ddd827c 100644 --- a/health/guides/processes/active_processes.md +++ b/src/health/guides/processes/active_processes.md diff --git a/health/guides/qos/10min_qos_packet_drops.md b/src/health/guides/qos/10min_qos_packet_drops.md index b2e0d8c8a..b2e0d8c8a 100644 --- a/health/guides/qos/10min_qos_packet_drops.md +++ b/src/health/guides/qos/10min_qos_packet_drops.md diff --git a/health/guides/ram/oom_kill.md b/src/health/guides/ram/oom_kill.md index 69afb8146..69afb8146 100644 --- a/health/guides/ram/oom_kill.md +++ b/src/health/guides/ram/oom_kill.md diff --git a/health/guides/ram/ram_available.md b/src/health/guides/ram/ram_available.md index f94bdf3bd..f94bdf3bd 100644 --- a/health/guides/ram/ram_available.md +++ b/src/health/guides/ram/ram_available.md diff --git a/health/guides/ram/ram_in_use.md b/src/health/guides/ram/ram_in_use.md index 9c686daa8..9c686daa8 100644 --- a/health/guides/ram/ram_in_use.md +++ b/src/health/guides/ram/ram_in_use.md diff --git a/health/guides/redis/redis_bgsave_broken.md b/src/health/guides/redis/redis_bgsave_broken.md index 23ed75ff1..23ed75ff1 100644 --- a/health/guides/redis/redis_bgsave_broken.md +++ b/src/health/guides/redis/redis_bgsave_broken.md diff --git a/health/guides/redis/redis_bgsave_slow.md b/src/health/guides/redis/redis_bgsave_slow.md index 6a04bdf27..6a04bdf27 100644 --- a/health/guides/redis/redis_bgsave_slow.md +++ b/src/health/guides/redis/redis_bgsave_slow.md diff --git a/health/guides/redis/redis_connections_rejected.md b/src/health/guides/redis/redis_connections_rejected.md index 784602461..784602461 100644 --- 
a/health/guides/redis/redis_connections_rejected.md +++ b/src/health/guides/redis/redis_connections_rejected.md diff --git a/health/guides/redis/redis_master_link_down.md b/src/health/guides/redis/redis_master_link_down.md index 5a2d24293..5a2d24293 100644 --- a/health/guides/redis/redis_master_link_down.md +++ b/src/health/guides/redis/redis_master_link_down.md diff --git a/health/guides/retroshare/retroshare_dht_working.md b/src/health/guides/retroshare/retroshare_dht_working.md index d1e26ac18..d1e26ac18 100644 --- a/health/guides/retroshare/retroshare_dht_working.md +++ b/src/health/guides/retroshare/retroshare_dht_working.md diff --git a/health/guides/riakkv/riakkv_1h_kv_get_mean_latency.md b/src/health/guides/riakkv/riakkv_1h_kv_get_mean_latency.md index 7233423ee..7233423ee 100644 --- a/health/guides/riakkv/riakkv_1h_kv_get_mean_latency.md +++ b/src/health/guides/riakkv/riakkv_1h_kv_get_mean_latency.md diff --git a/health/guides/riakkv/riakkv_1h_kv_put_mean_latency.md b/src/health/guides/riakkv/riakkv_1h_kv_put_mean_latency.md index cc2cad28f..cc2cad28f 100644 --- a/health/guides/riakkv/riakkv_1h_kv_put_mean_latency.md +++ b/src/health/guides/riakkv/riakkv_1h_kv_put_mean_latency.md diff --git a/health/guides/riakkv/riakkv_kv_get_slow.md b/src/health/guides/riakkv/riakkv_kv_get_slow.md index 05fd67ce7..888c96e72 100644 --- a/health/guides/riakkv/riakkv_kv_get_slow.md +++ b/src/health/guides/riakkv/riakkv_kv_get_slow.md @@ -17,6 +17,5 @@ The `riakkv_kv_get_slow` alert is related to Riak KV, a distributed NoSQL key-va ### Useful resources 1. [Riak KV documentation](https://riak.com/documentation/) -2. [Monitoring Riak KV with Netdata](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin/riakkv/) -3. [Riak Control: Monitoring and Administration Interface](https://docs.riak.com/riak/kv/2.2.3/configuring/reference/riak-vars/#riak-control) -4. 
[Riak KV Monitoring and Metrics](https://docs.riak.com/riak/kv/2.2.3/using/performance/monitoring/index.html) +2. [Riak Control: Monitoring and Administration Interface](https://docs.riak.com/riak/kv/2.2.3/configuring/reference/riak-vars/#riak-control) +3. [Riak KV Monitoring and Metrics](https://docs.riak.com/riak/kv/2.2.3/using/performance/monitoring/index.html) diff --git a/health/guides/riakkv/riakkv_kv_put_slow.md b/src/health/guides/riakkv/riakkv_kv_put_slow.md index 9bd314e7e..9bd314e7e 100644 --- a/health/guides/riakkv/riakkv_kv_put_slow.md +++ b/src/health/guides/riakkv/riakkv_kv_put_slow.md diff --git a/health/guides/riakkv/riakkv_list_keys_active.md b/src/health/guides/riakkv/riakkv_list_keys_active.md index 38d42a37e..38d42a37e 100644 --- a/health/guides/riakkv/riakkv_list_keys_active.md +++ b/src/health/guides/riakkv/riakkv_list_keys_active.md diff --git a/health/guides/riakkv/riakkv_vm_high_process_count.md b/src/health/guides/riakkv/riakkv_vm_high_process_count.md index 7fd79517e..7fd79517e 100644 --- a/health/guides/riakkv/riakkv_vm_high_process_count.md +++ b/src/health/guides/riakkv/riakkv_vm_high_process_count.md diff --git a/health/guides/scaleio/scaleio_sdc_mdm_connection_state.md b/src/health/guides/scaleio/scaleio_sdc_mdm_connection_state.md index 1e09b978c..1e09b978c 100644 --- a/health/guides/scaleio/scaleio_sdc_mdm_connection_state.md +++ b/src/health/guides/scaleio/scaleio_sdc_mdm_connection_state.md diff --git a/health/guides/scaleio/scaleio_storage_pool_capacity_utilization.md b/src/health/guides/scaleio/scaleio_storage_pool_capacity_utilization.md index 0f8a723b8..0f8a723b8 100644 --- a/health/guides/scaleio/scaleio_storage_pool_capacity_utilization.md +++ b/src/health/guides/scaleio/scaleio_storage_pool_capacity_utilization.md diff --git a/health/guides/sync/sync_freq.md b/src/health/guides/sync/sync_freq.md index bb1043704..bb1043704 100644 --- a/health/guides/sync/sync_freq.md +++ b/src/health/guides/sync/sync_freq.md diff --git 
a/health/guides/systemdunits/systemd_automount_unit_failed_state.md b/src/health/guides/systemdunits/systemd_automount_unit_failed_state.md index eb3024a90..eb3024a90 100644 --- a/health/guides/systemdunits/systemd_automount_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_automount_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_device_unit_failed_state.md b/src/health/guides/systemdunits/systemd_device_unit_failed_state.md index 8a7fc39d9..8a7fc39d9 100644 --- a/health/guides/systemdunits/systemd_device_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_device_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_mount_unit_failed_state.md b/src/health/guides/systemdunits/systemd_mount_unit_failed_state.md index 5840b7ce3..5840b7ce3 100644 --- a/health/guides/systemdunits/systemd_mount_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_mount_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_path_unit_failed_state.md b/src/health/guides/systemdunits/systemd_path_unit_failed_state.md index 9a4749b6d..9a4749b6d 100644 --- a/health/guides/systemdunits/systemd_path_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_path_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_scope_unit_failed_state.md b/src/health/guides/systemdunits/systemd_scope_unit_failed_state.md index e080ae364..e080ae364 100644 --- a/health/guides/systemdunits/systemd_scope_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_scope_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_service_unit_failed_state.md b/src/health/guides/systemdunits/systemd_service_unit_failed_state.md index f73567992..f73567992 100644 --- a/health/guides/systemdunits/systemd_service_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_service_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_slice_unit_failed_state.md 
b/src/health/guides/systemdunits/systemd_slice_unit_failed_state.md index d736f83fe..d736f83fe 100644 --- a/health/guides/systemdunits/systemd_slice_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_slice_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_socket_unit_failed_state.md b/src/health/guides/systemdunits/systemd_socket_unit_failed_state.md index 9d2d43665..9d2d43665 100644 --- a/health/guides/systemdunits/systemd_socket_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_socket_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_swap_unit_failed_state.md b/src/health/guides/systemdunits/systemd_swap_unit_failed_state.md index 516156d0a..516156d0a 100644 --- a/health/guides/systemdunits/systemd_swap_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_swap_unit_failed_state.md diff --git a/health/guides/systemdunits/systemd_target_unit_failed_state.md b/src/health/guides/systemdunits/systemd_target_unit_failed_state.md index 843405147..843405147 100644 --- a/health/guides/systemdunits/systemd_target_unit_failed_state.md +++ b/src/health/guides/systemdunits/systemd_target_unit_failed_state.md diff --git a/health/guides/tcp/10s_ipv4_tcp_resets_received.md b/src/health/guides/tcp/10s_ipv4_tcp_resets_received.md index c17954f2d..c17954f2d 100644 --- a/health/guides/tcp/10s_ipv4_tcp_resets_received.md +++ b/src/health/guides/tcp/10s_ipv4_tcp_resets_received.md diff --git a/health/guides/tcp/10s_ipv4_tcp_resets_sent.md b/src/health/guides/tcp/10s_ipv4_tcp_resets_sent.md index 9a941694e..9a941694e 100644 --- a/health/guides/tcp/10s_ipv4_tcp_resets_sent.md +++ b/src/health/guides/tcp/10s_ipv4_tcp_resets_sent.md diff --git a/health/guides/tcp/1m_ipv4_tcp_resets_received.md b/src/health/guides/tcp/1m_ipv4_tcp_resets_received.md index 89f01f3cb..89f01f3cb 100644 --- a/health/guides/tcp/1m_ipv4_tcp_resets_received.md +++ b/src/health/guides/tcp/1m_ipv4_tcp_resets_received.md diff --git 
a/health/guides/tcp/1m_ipv4_tcp_resets_sent.md b/src/health/guides/tcp/1m_ipv4_tcp_resets_sent.md index fa052e6bb..fa052e6bb 100644 --- a/health/guides/tcp/1m_ipv4_tcp_resets_sent.md +++ b/src/health/guides/tcp/1m_ipv4_tcp_resets_sent.md diff --git a/health/guides/tcp/1m_tcp_accept_queue_drops.md b/src/health/guides/tcp/1m_tcp_accept_queue_drops.md index 5926d24c9..5926d24c9 100644 --- a/health/guides/tcp/1m_tcp_accept_queue_drops.md +++ b/src/health/guides/tcp/1m_tcp_accept_queue_drops.md diff --git a/health/guides/tcp/1m_tcp_accept_queue_overflows.md b/src/health/guides/tcp/1m_tcp_accept_queue_overflows.md index 7c5ddf0f5..7c5ddf0f5 100644 --- a/health/guides/tcp/1m_tcp_accept_queue_overflows.md +++ b/src/health/guides/tcp/1m_tcp_accept_queue_overflows.md diff --git a/health/guides/tcp/1m_tcp_syn_queue_cookies.md b/src/health/guides/tcp/1m_tcp_syn_queue_cookies.md index 8dafb9f41..8dafb9f41 100644 --- a/health/guides/tcp/1m_tcp_syn_queue_cookies.md +++ b/src/health/guides/tcp/1m_tcp_syn_queue_cookies.md diff --git a/health/guides/tcp/1m_tcp_syn_queue_drops.md b/src/health/guides/tcp/1m_tcp_syn_queue_drops.md index c29d86d77..c29d86d77 100644 --- a/health/guides/tcp/1m_tcp_syn_queue_drops.md +++ b/src/health/guides/tcp/1m_tcp_syn_queue_drops.md diff --git a/health/guides/tcp/tcp_connections.md b/src/health/guides/tcp/tcp_connections.md index 849a05ac2..849a05ac2 100644 --- a/health/guides/tcp/tcp_connections.md +++ b/src/health/guides/tcp/tcp_connections.md diff --git a/health/guides/tcp/tcp_memory.md b/src/health/guides/tcp/tcp_memory.md index 99223c224..99223c224 100644 --- a/health/guides/tcp/tcp_memory.md +++ b/src/health/guides/tcp/tcp_memory.md diff --git a/health/guides/tcp/tcp_orphans.md b/src/health/guides/tcp/tcp_orphans.md index d7dd35a87..d7dd35a87 100644 --- a/health/guides/tcp/tcp_orphans.md +++ b/src/health/guides/tcp/tcp_orphans.md diff --git a/health/guides/timex/system_clock_sync_state.md b/src/health/guides/timex/system_clock_sync_state.md index 
c242e0a55..c242e0a55 100644 --- a/health/guides/timex/system_clock_sync_state.md +++ b/src/health/guides/timex/system_clock_sync_state.md diff --git a/health/guides/udp/1m_ipv4_udp_receive_buffer_errors.md b/src/health/guides/udp/1m_ipv4_udp_receive_buffer_errors.md index a100ebbb1..a100ebbb1 100644 --- a/health/guides/udp/1m_ipv4_udp_receive_buffer_errors.md +++ b/src/health/guides/udp/1m_ipv4_udp_receive_buffer_errors.md diff --git a/health/guides/udp/1m_ipv4_udp_send_buffer_errors.md b/src/health/guides/udp/1m_ipv4_udp_send_buffer_errors.md index 7d0411e3d..7d0411e3d 100644 --- a/health/guides/udp/1m_ipv4_udp_send_buffer_errors.md +++ b/src/health/guides/udp/1m_ipv4_udp_send_buffer_errors.md diff --git a/health/guides/unbound/unbound_request_list_dropped.md b/src/health/guides/unbound/unbound_request_list_dropped.md index deed815ee..deed815ee 100644 --- a/health/guides/unbound/unbound_request_list_dropped.md +++ b/src/health/guides/unbound/unbound_request_list_dropped.md diff --git a/health/guides/unbound/unbound_request_list_overwritten.md b/src/health/guides/unbound/unbound_request_list_overwritten.md index fd74a1632..fd74a1632 100644 --- a/health/guides/unbound/unbound_request_list_overwritten.md +++ b/src/health/guides/unbound/unbound_request_list_overwritten.md diff --git a/health/guides/upsd/upsd_10min_ups_load.md b/src/health/guides/upsd/upsd_10min_ups_load.md index fad4a2f6f..fad4a2f6f 100644 --- a/health/guides/upsd/upsd_10min_ups_load.md +++ b/src/health/guides/upsd/upsd_10min_ups_load.md diff --git a/health/guides/upsd/upsd_ups_battery_charge.md b/src/health/guides/upsd/upsd_ups_battery_charge.md index 0d8f757f2..0d8f757f2 100644 --- a/health/guides/upsd/upsd_ups_battery_charge.md +++ b/src/health/guides/upsd/upsd_ups_battery_charge.md diff --git a/health/guides/upsd/upsd_ups_last_collected_secs.md b/src/health/guides/upsd/upsd_ups_last_collected_secs.md index 818247834..818247834 100644 --- a/health/guides/upsd/upsd_ups_last_collected_secs.md +++ 
b/src/health/guides/upsd/upsd_ups_last_collected_secs.md diff --git a/health/guides/vcsa/vcsa_applmgmt_health.md b/src/health/guides/vcsa/vcsa_applmgmt_health.md index 06f391b3d..06f391b3d 100644 --- a/health/guides/vcsa/vcsa_applmgmt_health.md +++ b/src/health/guides/vcsa/vcsa_applmgmt_health.md diff --git a/health/guides/vcsa/vcsa_database_storage_health.md b/src/health/guides/vcsa/vcsa_database_storage_health.md index eb978b07b..eb978b07b 100644 --- a/health/guides/vcsa/vcsa_database_storage_health.md +++ b/src/health/guides/vcsa/vcsa_database_storage_health.md diff --git a/health/guides/vcsa/vcsa_load_health.md b/src/health/guides/vcsa/vcsa_load_health.md index 026138d52..026138d52 100644 --- a/health/guides/vcsa/vcsa_load_health.md +++ b/src/health/guides/vcsa/vcsa_load_health.md diff --git a/health/guides/vcsa/vcsa_mem_health.md b/src/health/guides/vcsa/vcsa_mem_health.md index 1e3604656..1e3604656 100644 --- a/health/guides/vcsa/vcsa_mem_health.md +++ b/src/health/guides/vcsa/vcsa_mem_health.md diff --git a/health/guides/vcsa/vcsa_software_updates_health.md b/src/health/guides/vcsa/vcsa_software_updates_health.md index 505e20f5c..505e20f5c 100644 --- a/health/guides/vcsa/vcsa_software_updates_health.md +++ b/src/health/guides/vcsa/vcsa_software_updates_health.md diff --git a/health/guides/vcsa/vcsa_storage_health.md b/src/health/guides/vcsa/vcsa_storage_health.md index 9dbfe69cb..9dbfe69cb 100644 --- a/health/guides/vcsa/vcsa_storage_health.md +++ b/src/health/guides/vcsa/vcsa_storage_health.md diff --git a/health/guides/vcsa/vcsa_swap_health.md b/src/health/guides/vcsa/vcsa_swap_health.md index 6e236ed34..6e236ed34 100644 --- a/health/guides/vcsa/vcsa_swap_health.md +++ b/src/health/guides/vcsa/vcsa_swap_health.md diff --git a/health/guides/vcsa/vcsa_system_health.md b/src/health/guides/vcsa/vcsa_system_health.md index 6e58a68dc..6e58a68dc 100644 --- a/health/guides/vcsa/vcsa_system_health.md +++ b/src/health/guides/vcsa/vcsa_system_health.md diff --git 
a/health/guides/vernemq/vernemq_average_scheduler_utilization.md b/src/health/guides/vernemq/vernemq_average_scheduler_utilization.md index 5e5bc6d43..5e5bc6d43 100644 --- a/health/guides/vernemq/vernemq_average_scheduler_utilization.md +++ b/src/health/guides/vernemq/vernemq_average_scheduler_utilization.md diff --git a/health/guides/vernemq/vernemq_cluster_dropped.md b/src/health/guides/vernemq/vernemq_cluster_dropped.md index 0bdc6f08d..0bdc6f08d 100644 --- a/health/guides/vernemq/vernemq_cluster_dropped.md +++ b/src/health/guides/vernemq/vernemq_cluster_dropped.md diff --git a/health/guides/vernemq/vernemq_mqtt_connack_sent_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_connack_sent_reason_unsuccessful.md index d68db0d1c..d68db0d1c 100644 --- a/health/guides/vernemq/vernemq_mqtt_connack_sent_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_connack_sent_reason_unsuccessful.md diff --git a/health/guides/vernemq/vernemq_mqtt_disconnect_received_reason_not_normal.md b/src/health/guides/vernemq/vernemq_mqtt_disconnect_received_reason_not_normal.md index 014c5b0cf..014c5b0cf 100644 --- a/health/guides/vernemq/vernemq_mqtt_disconnect_received_reason_not_normal.md +++ b/src/health/guides/vernemq/vernemq_mqtt_disconnect_received_reason_not_normal.md diff --git a/health/guides/vernemq/vernemq_mqtt_disconnect_sent_reason_not_normal.md b/src/health/guides/vernemq/vernemq_mqtt_disconnect_sent_reason_not_normal.md index 7bbc1ba16..7bbc1ba16 100644 --- a/health/guides/vernemq/vernemq_mqtt_disconnect_sent_reason_not_normal.md +++ b/src/health/guides/vernemq/vernemq_mqtt_disconnect_sent_reason_not_normal.md diff --git a/health/guides/vernemq/vernemq_mqtt_puback_received_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_puback_received_reason_unsuccessful.md index f7b506669..f7b506669 100644 --- a/health/guides/vernemq/vernemq_mqtt_puback_received_reason_unsuccessful.md +++ 
b/src/health/guides/vernemq/vernemq_mqtt_puback_received_reason_unsuccessful.md diff --git a/health/guides/vernemq/vernemq_mqtt_puback_sent_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_puback_sent_reason_unsuccessful.md index 85a06a220..85a06a220 100644 --- a/health/guides/vernemq/vernemq_mqtt_puback_sent_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_puback_sent_reason_unsuccessful.md diff --git a/health/guides/vernemq/vernemq_mqtt_puback_unexpected.md b/src/health/guides/vernemq/vernemq_mqtt_puback_unexpected.md index b2541e867..b2541e867 100644 --- a/health/guides/vernemq/vernemq_mqtt_puback_unexpected.md +++ b/src/health/guides/vernemq/vernemq_mqtt_puback_unexpected.md diff --git a/health/guides/vernemq/vernemq_mqtt_pubcomp_received_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_received_reason_unsuccessful.md index 5bdfd5b38..5bdfd5b38 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubcomp_received_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_received_reason_unsuccessful.md diff --git a/health/guides/vernemq/vernemq_mqtt_pubcomp_sent_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_sent_reason_unsuccessful.md index cc71b739b..cc71b739b 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubcomp_sent_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_sent_reason_unsuccessful.md diff --git a/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md index ab4932177..b6fb32d6c 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md @@ -25,5 +25,5 @@ In MQTT, the PUBCOMP packet is used when QoS (Quality of Service) 2 is applied. ### Useful resources -1. [VerneMQ Documentation](https://vernemq.com/documentation/) +1. [VerneMQ Documentation](https://docs.vernemq.com/) 2. 
[MQTT Specification - MQTT Control Packets](https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901046) diff --git a/health/guides/vernemq/vernemq_mqtt_publish_auth_errors.md b/src/health/guides/vernemq/vernemq_mqtt_publish_auth_errors.md index 46bc7d312..46bc7d312 100644 --- a/health/guides/vernemq/vernemq_mqtt_publish_auth_errors.md +++ b/src/health/guides/vernemq/vernemq_mqtt_publish_auth_errors.md diff --git a/health/guides/vernemq/vernemq_mqtt_publish_errors.md b/src/health/guides/vernemq/vernemq_mqtt_publish_errors.md index 9b57b1a74..9b57b1a74 100644 --- a/health/guides/vernemq/vernemq_mqtt_publish_errors.md +++ b/src/health/guides/vernemq/vernemq_mqtt_publish_errors.md diff --git a/health/guides/vernemq/vernemq_mqtt_pubrec_invalid_error.md b/src/health/guides/vernemq/vernemq_mqtt_pubrec_invalid_error.md index 47cd0fefc..47cd0fefc 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubrec_invalid_error.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubrec_invalid_error.md diff --git a/health/guides/vernemq/vernemq_mqtt_pubrec_received_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubrec_received_reason_unsuccessful.md index b01dc9fbb..b01dc9fbb 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubrec_received_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubrec_received_reason_unsuccessful.md diff --git a/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md index 9b1976494..2a7a0ca5b 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md @@ -25,6 +25,6 @@ In the MQTT protocol, when a client sends a PUBLISH message with Quality of Serv ### Useful resources -1. [VerneMQ Documentation](https://vernemq.com/documentation.html) +1. [VerneMQ Documentation](https://docs.vernemq.com/) 2. 
[MQTT Essentials – All Core MQTT Concepts explained](https://www.hivemq.com/mqtt-essentials/) 3. [Understanding QoS Levels in MQTT](https://www.hivemq.com/blog/mqtt-essentials-part-6-mqtt-quality-of-service-levels/)
\ No newline at end of file diff --git a/health/guides/vernemq/vernemq_mqtt_pubrel_received_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubrel_received_reason_unsuccessful.md index 67a54f0c3..67a54f0c3 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubrel_received_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubrel_received_reason_unsuccessful.md diff --git a/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md index 18e85e12a..85bc661a5 100644 --- a/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md +++ b/src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md @@ -45,5 +45,4 @@ This alert is related to VerneMQ, a high-performance MQTT broker. It monitors th ### Useful resources 1. [VerneMQ - Official Documentation](https://docs.vernemq.com/) -2. [MQTT Essentials: Quality of Service 2 (QoS 2)](https://www.hivemq.com/blog/mqtt-essentials-part-6-mqtt-quality-of-service-levels/) -3. [Netdata - VerneMQ monitoring](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/vernemq)
\ No newline at end of file +2. [MQTT Essentials: Quality of Service 2 (QoS 2)](https://www.hivemq.com/blog/mqtt-essentials-part-6-mqtt-quality-of-service-levels/)
\ No newline at end of file diff --git a/health/guides/vernemq/vernemq_mqtt_subscribe_auth_error.md b/src/health/guides/vernemq/vernemq_mqtt_subscribe_auth_error.md index b80118730..b80118730 100644 --- a/health/guides/vernemq/vernemq_mqtt_subscribe_auth_error.md +++ b/src/health/guides/vernemq/vernemq_mqtt_subscribe_auth_error.md diff --git a/health/guides/vernemq/vernemq_mqtt_subscribe_error.md b/src/health/guides/vernemq/vernemq_mqtt_subscribe_error.md index f14d18d55..f14d18d55 100644 --- a/health/guides/vernemq/vernemq_mqtt_subscribe_error.md +++ b/src/health/guides/vernemq/vernemq_mqtt_subscribe_error.md diff --git a/health/guides/vernemq/vernemq_mqtt_unsubscribe_error.md b/src/health/guides/vernemq/vernemq_mqtt_unsubscribe_error.md index 55feb0a17..55feb0a17 100644 --- a/health/guides/vernemq/vernemq_mqtt_unsubscribe_error.md +++ b/src/health/guides/vernemq/vernemq_mqtt_unsubscribe_error.md diff --git a/health/guides/vernemq/vernemq_netsplits.md b/src/health/guides/vernemq/vernemq_netsplits.md index 15d4d4498..15d4d4498 100644 --- a/health/guides/vernemq/vernemq_netsplits.md +++ b/src/health/guides/vernemq/vernemq_netsplits.md diff --git a/health/guides/vernemq/vernemq_queue_message_drop.md b/src/health/guides/vernemq/vernemq_queue_message_drop.md index 0b97c6b7a..0b97c6b7a 100644 --- a/health/guides/vernemq/vernemq_queue_message_drop.md +++ b/src/health/guides/vernemq/vernemq_queue_message_drop.md diff --git a/health/guides/vernemq/vernemq_queue_message_expired.md b/src/health/guides/vernemq/vernemq_queue_message_expired.md index bd0533402..85a8688b8 100644 --- a/health/guides/vernemq/vernemq_queue_message_expired.md +++ b/src/health/guides/vernemq/vernemq_queue_message_expired.md @@ -50,4 +50,3 @@ Expired messages are removed from the queue and are not delivered to subscribers ### Useful resources 1. [VerneMQ Documentation](https://vernemq.com/docs/) -2. 
[How to Monitor VerneMQ MQTT broker with Netdata](https://learn.netdata.cloud/guides/monitor/vernemq.html) diff --git a/health/guides/vernemq/vernemq_queue_message_unhandled.md b/src/health/guides/vernemq/vernemq_queue_message_unhandled.md index e2b5c5034..e2b5c5034 100644 --- a/health/guides/vernemq/vernemq_queue_message_unhandled.md +++ b/src/health/guides/vernemq/vernemq_queue_message_unhandled.md diff --git a/health/guides/vernemq/vernemq_socket_errors.md b/src/health/guides/vernemq/vernemq_socket_errors.md index 0be28eb6c..0be28eb6c 100644 --- a/health/guides/vernemq/vernemq_socket_errors.md +++ b/src/health/guides/vernemq/vernemq_socket_errors.md diff --git a/health/guides/vsphere/vsphere_cpu_usage.md b/src/health/guides/vsphere/vsphere_cpu_usage.md index 0278edae4..0278edae4 100644 --- a/health/guides/vsphere/vsphere_cpu_usage.md +++ b/src/health/guides/vsphere/vsphere_cpu_usage.md diff --git a/health/guides/vsphere/vsphere_host_mem_usage.md b/src/health/guides/vsphere/vsphere_host_mem_usage.md index 458e403a2..991a76f85 100644 --- a/health/guides/vsphere/vsphere_host_mem_usage.md +++ b/src/health/guides/vsphere/vsphere_host_mem_usage.md @@ -28,6 +28,5 @@ The `vsphere_host_mem_usage` alert is triggered when the memory utilization of a ### Useful resources -1. [Understanding Memory Utilization in VMware vSphere Host](https://www.altaro.com/vmware/memory-utilization-vmware-esxi/) -2. [vSphere Monitoring and Performance Documentation](https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.monitoring.doc/GUID-115861E6-810A-43BB-8CDB-EE99CF8F3250.html) -3. [Optimizing Memory Performance in VMware vSphere](https://blogs.vmware.com/performance/2021/04/optimizing-memory-performance-in-vmware-vsphere.html)
\ No newline at end of file +1. [vSphere Monitoring and Performance Documentation](https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.monitoring.doc/GUID-115861E6-810A-43BB-8CDB-EE99CF8F3250.html) +2. [Optimizing Memory Performance in VMware vSphere](https://blogs.vmware.com/performance/2021/04/optimizing-memory-performance-in-vmware-vsphere.html)
\ No newline at end of file diff --git a/health/guides/vsphere/vsphere_inbound_packets_dropped.md b/src/health/guides/vsphere/vsphere_inbound_packets_dropped.md index 6c3da6561..6c3da6561 100644 --- a/health/guides/vsphere/vsphere_inbound_packets_dropped.md +++ b/src/health/guides/vsphere/vsphere_inbound_packets_dropped.md diff --git a/health/guides/vsphere/vsphere_inbound_packets_dropped_ratio.md b/src/health/guides/vsphere/vsphere_inbound_packets_dropped_ratio.md index 6dccfa79a..6dccfa79a 100644 --- a/health/guides/vsphere/vsphere_inbound_packets_dropped_ratio.md +++ b/src/health/guides/vsphere/vsphere_inbound_packets_dropped_ratio.md diff --git a/health/guides/vsphere/vsphere_inbound_packets_errors.md b/src/health/guides/vsphere/vsphere_inbound_packets_errors.md index ef56fd6ee..ef56fd6ee 100644 --- a/health/guides/vsphere/vsphere_inbound_packets_errors.md +++ b/src/health/guides/vsphere/vsphere_inbound_packets_errors.md diff --git a/health/guides/vsphere/vsphere_inbound_packets_errors_ratio.md b/src/health/guides/vsphere/vsphere_inbound_packets_errors_ratio.md index b7d0af21f..b7d0af21f 100644 --- a/health/guides/vsphere/vsphere_inbound_packets_errors_ratio.md +++ b/src/health/guides/vsphere/vsphere_inbound_packets_errors_ratio.md diff --git a/health/guides/vsphere/vsphere_outbound_packets_dropped.md b/src/health/guides/vsphere/vsphere_outbound_packets_dropped.md index 93c508e95..93c508e95 100644 --- a/health/guides/vsphere/vsphere_outbound_packets_dropped.md +++ b/src/health/guides/vsphere/vsphere_outbound_packets_dropped.md diff --git a/health/guides/vsphere/vsphere_outbound_packets_dropped_ratio.md b/src/health/guides/vsphere/vsphere_outbound_packets_dropped_ratio.md index 8296198f5..8296198f5 100644 --- a/health/guides/vsphere/vsphere_outbound_packets_dropped_ratio.md +++ b/src/health/guides/vsphere/vsphere_outbound_packets_dropped_ratio.md diff --git a/health/guides/vsphere/vsphere_outbound_packets_errors.md 
b/src/health/guides/vsphere/vsphere_outbound_packets_errors.md index 7f50579d4..7f50579d4 100644 --- a/health/guides/vsphere/vsphere_outbound_packets_errors.md +++ b/src/health/guides/vsphere/vsphere_outbound_packets_errors.md diff --git a/health/guides/vsphere/vsphere_outbound_packets_errors_ratio.md b/src/health/guides/vsphere/vsphere_outbound_packets_errors_ratio.md index 333566ee1..333566ee1 100644 --- a/health/guides/vsphere/vsphere_outbound_packets_errors_ratio.md +++ b/src/health/guides/vsphere/vsphere_outbound_packets_errors_ratio.md diff --git a/health/guides/vsphere/vsphere_vm_mem_usage.md b/src/health/guides/vsphere/vsphere_vm_mem_usage.md index 0e6992149..0e6992149 100644 --- a/health/guides/vsphere/vsphere_vm_mem_usage.md +++ b/src/health/guides/vsphere/vsphere_vm_mem_usage.md diff --git a/health/guides/web_log/1m_bad_requests.md b/src/health/guides/web_log/1m_bad_requests.md index d8702b244..d8702b244 100644 --- a/health/guides/web_log/1m_bad_requests.md +++ b/src/health/guides/web_log/1m_bad_requests.md diff --git a/health/guides/web_log/1m_internal_errors.md b/src/health/guides/web_log/1m_internal_errors.md index 64a1ce081..64a1ce081 100644 --- a/health/guides/web_log/1m_internal_errors.md +++ b/src/health/guides/web_log/1m_internal_errors.md diff --git a/health/guides/web_log/1m_successful.md b/src/health/guides/web_log/1m_successful.md index abe790086..abe790086 100644 --- a/health/guides/web_log/1m_successful.md +++ b/src/health/guides/web_log/1m_successful.md diff --git a/health/guides/web_log/web_log_10m_response_time.md b/src/health/guides/web_log/web_log_10m_response_time.md index 603482a9b..603482a9b 100644 --- a/health/guides/web_log/web_log_10m_response_time.md +++ b/src/health/guides/web_log/web_log_10m_response_time.md diff --git a/health/guides/web_log/web_log_1m_bad_requests.md b/src/health/guides/web_log/web_log_1m_bad_requests.md index a296c90e6..a296c90e6 100644 --- a/health/guides/web_log/web_log_1m_bad_requests.md +++ 
b/src/health/guides/web_log/web_log_1m_bad_requests.md diff --git a/health/guides/web_log/web_log_1m_internal_errors.md b/src/health/guides/web_log/web_log_1m_internal_errors.md index 6eff7c68a..6eff7c68a 100644 --- a/health/guides/web_log/web_log_1m_internal_errors.md +++ b/src/health/guides/web_log/web_log_1m_internal_errors.md diff --git a/health/guides/web_log/web_log_1m_redirects.md b/src/health/guides/web_log/web_log_1m_redirects.md index 663f04f5f..663f04f5f 100644 --- a/health/guides/web_log/web_log_1m_redirects.md +++ b/src/health/guides/web_log/web_log_1m_redirects.md diff --git a/health/guides/web_log/web_log_1m_requests.md b/src/health/guides/web_log/web_log_1m_requests.md index 230aa8c8e..230aa8c8e 100644 --- a/health/guides/web_log/web_log_1m_requests.md +++ b/src/health/guides/web_log/web_log_1m_requests.md diff --git a/health/guides/web_log/web_log_1m_successful.md b/src/health/guides/web_log/web_log_1m_successful.md index b97515388..b97515388 100644 --- a/health/guides/web_log/web_log_1m_successful.md +++ b/src/health/guides/web_log/web_log_1m_successful.md diff --git a/health/guides/web_log/web_log_1m_total_requests.md b/src/health/guides/web_log/web_log_1m_total_requests.md index c867cfbf6..7dc19983d 100644 --- a/health/guides/web_log/web_log_1m_total_requests.md +++ b/src/health/guides/web_log/web_log_1m_total_requests.md @@ -30,7 +30,4 @@ An increase in workload means that your web server is handling more traffic than ### Useful resources -1. [Analyzing Web server logs with ApacheTop](https://www.howtoforge.com/how-to-analyze-apache-web-server-logs-apachetop) -2. [Logstash Guide: Analyzing Logs](https://www.elastic.co/guide/en/logstash/current/logstash-intro.html) -3. [Web Application Performance Monitoring with New Relic](https://newrelic.com/platform/web-application-monitoring) -4. [Vertically or Horizontally Scaling Your Web Server](https://www.digitalocean.com/community/tutorials/5-common-server-setups-for-your-web-application)
\ No newline at end of file +1. [Vertically or Horizontally Scaling Your Web Server](https://www.digitalocean.com/community/tutorials/5-common-server-setups-for-your-web-application)
\ No newline at end of file diff --git a/src/health/guides/web_log/web_log_1m_unmatched.md b/src/health/guides/web_log/web_log_1m_unmatched.md new file mode 100644 index 000000000..b95fa28bf --- /dev/null +++ b/src/health/guides/web_log/web_log_1m_unmatched.md @@ -0,0 +1,15 @@ +### Understand the alert + +In a webserver, all activity should be monitored. By default, most of the webservers log activity in an `access.log` file. The access log is a list of all requests for individual files that people or bots have requested from a website. Log File strings include notes about their requests for the HTML files and their embedded graphic images, along with any other associated files that are transmitted. + +The Netdata Agent calculates the percentage of unparsed log lines over the last minute. These are entries in the log file that didn't match in any of the common pattern operations (1XX, 2XX, etc) of the webserver. This can indicate an abnormal activity on your web server, or that your server is performing operations that you cannot monitor with the Agent. + +Web servers like NGINX and Apache2 give you the ability to modify the log patterns for each request. If you have done that, you also need to adjust the Netdata Agent to parse those patterns. + +### Troubleshoot the alert + +- Create a custom log format job + +You must create a new job in the `web_log` collector for your Agent. + +1. 
See how you can [configure this collector](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/weblog#configuration) diff --git a/health/guides/web_log/web_log_5m_requests_ratio.md b/src/health/guides/web_log/web_log_5m_requests_ratio.md index e2cf46f16..e2cf46f16 100644 --- a/health/guides/web_log/web_log_5m_requests_ratio.md +++ b/src/health/guides/web_log/web_log_5m_requests_ratio.md diff --git a/health/guides/web_log/web_log_5m_successful.md b/src/health/guides/web_log/web_log_5m_successful.md index 5c5b2c4e6..d3ca5916a 100644 --- a/health/guides/web_log/web_log_5m_successful.md +++ b/src/health/guides/web_log/web_log_5m_successful.md @@ -31,6 +31,4 @@ A successful HTTP request is one that receives a response with an HTTP status co ### Useful resources 1. [Apache Log Files](https://httpd.apache.org/docs/current/logs.html) -2. [Nginx Log Files](https://nginx.org/en/docs/ngx_core_module.html#error_log) -3. [Introduction to Identifying Security Vulnerabilities in Web Servers](https://www.acunetix.com/blog/articles/introduction-identifying-security-vulnerabilities-web-servers) -4. [Web Application Performance Analysis and Monitoring](https://www.site24x7.com/learning/web-application-performance.html)
\ No newline at end of file +2. [Nginx Log Files](https://nginx.org/en/docs/ngx_core_module.html#error_log)
\ No newline at end of file diff --git a/health/guides/web_log/web_log_5m_successful_old.md b/src/health/guides/web_log/web_log_5m_successful_old.md index bbee58a42..bbee58a42 100644 --- a/health/guides/web_log/web_log_5m_successful_old.md +++ b/src/health/guides/web_log/web_log_5m_successful_old.md diff --git a/health/guides/web_log/web_log_web_slow.md b/src/health/guides/web_log/web_log_web_slow.md index 7ed3ebe1f..917d0325c 100644 --- a/health/guides/web_log/web_log_web_slow.md +++ b/src/health/guides/web_log/web_log_web_slow.md @@ -41,8 +41,7 @@ There are several factors that can cause slow web server performance. To trouble ### Useful resources -1. [Optimizing NGINX for Performance](https://easyengine.io/tutorials/nginx/performance/) -2. [Apache Performance Tuning](https://httpd.apache.org/docs/2.4/misc/perf-tuning.html) -3. [Top 10 MySQL Performance Tuning Tips](https://www.databasejournal.com/features/mysql/top-10-mysql-performance-tuning-tips.html) -4. [10 Tips for Optimal PostgreSQL Performance](https://www.digitalocean.com/community/tutorials/10-tips-for-optimizing-postgresql-performance-on-a-digitalocean-droplet) -5. [A Beginner's Guide to HTTP Cache Headers](https://www.keycdn.com/blog/http-cache-headers)
\ No newline at end of file +1. [Apache Performance Tuning](https://httpd.apache.org/docs/2.4/misc/perf-tuning.html) +2. [Top 10 MySQL Performance Tuning Tips](https://www.databasejournal.com/features/mysql/top-10-mysql-performance-tuning-tips.html) +3. [10 Tips for Optimal PostgreSQL Performance](https://www.digitalocean.com/community/tutorials/10-tips-for-optimizing-postgresql-performance-on-a-digitalocean-droplet) +4. [A Beginner's Guide to HTTP Cache Headers](https://www.keycdn.com/blog/http-cache-headers) diff --git a/health/guides/whoisquery/whoisquery_days_until_expiration.md b/src/health/guides/whoisquery/whoisquery_days_until_expiration.md index 7775bd9bc..7775bd9bc 100644 --- a/health/guides/whoisquery/whoisquery_days_until_expiration.md +++ b/src/health/guides/whoisquery/whoisquery_days_until_expiration.md diff --git a/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md b/src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md index ce26c1e5e..0ba4a1fea 100644 --- a/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md +++ b/src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md @@ -47,6 +47,5 @@ sudo tcpdump -i <interface_name> 1. [Top 20 Netstat Command Examples in Linux](https://www.tecmint.com/20-netstat-commands-for-linux-network-management/) 2. [iftop command in Linux to monitor network traffic](https://www.tecmint.com/iftop-linux-network-bandwidth-monitoring-tool/) -3. [An Overview of Packet Sniffing using Tcpdump](https://www.ubuntupit.com/tcpdump-useful-unix-packet-sniffer-command/) Remember to replace `<interface_name>` with the actual name of the WiFi network interface causing the alert.
\ No newline at end of file diff --git a/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md b/src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md index 8441885df..5223fc073 100644 --- a/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md +++ b/src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md @@ -49,6 +49,4 @@ You can adjust network settings, like buffers or queues, to mitigate dropped pac ### Useful resources 1. [ifconfig command in Linux](https://www.geeksforgeeks.org/ifconfig-command-in-linux-with-examples/) -2. [How to monitor network usage with iftop](https://www.binarytides.com/monitor-network-usage-with-iftop/) -3. [nload – Monitor Network Traffic and Bandwidth Usage in Real Time](https://www.tecmint.com/nload-monitor-linux-network-traffic-bandwidth-usage/) -4. [VNstat – A Network Traffic Monitor](https://www.tecmint.com/vnstat-monitor-network-traffic-in-linux/)
\ No newline at end of file +2. [nload – Monitor Network Traffic and Bandwidth Usage in Real Time](https://www.tecmint.com/nload-monitor-linux-network-traffic-bandwidth-usage/)
\ No newline at end of file diff --git a/health/guides/windows/windows_10min_cpu_usage.md b/src/health/guides/windows/windows_10min_cpu_usage.md index 5b585c714..70edb16d9 100644 --- a/health/guides/windows/windows_10min_cpu_usage.md +++ b/src/health/guides/windows/windows_10min_cpu_usage.md @@ -30,7 +30,6 @@ This alert calculates the average total `CPU utilization` on a Windows system ov ### Useful resources -1. [How to Monitor CPU Usage on Windows](https://www.tomsguide.com/how-to/how-to-monitor-cpu-usage-on-windows) -2. [Windows Task Manager: A Troubleshooting Guide](https://www.howtogeek.com/66622/stupid-geek-tricks-6-ways-to-open-windows-task-manager/) -3. [How to Use the Performance Monitor on Windows](https://www.digitalcitizen.life/how-use-performance-monitor-windows/) -4. [Understanding Process Explorer](https://docs.microsoft.com/en-us/sysinternals/downloads/process-explorer)
\ No newline at end of file +1. [Windows Task Manager: A Troubleshooting Guide](https://www.howtogeek.com/66622/stupid-geek-tricks-6-ways-to-open-windows-task-manager/) +2. [How to Use the Performance Monitor on Windows](https://www.digitalcitizen.life/how-use-performance-monitor-windows/) +3. [Understanding Process Explorer](https://docs.microsoft.com/en-us/sysinternals/downloads/process-explorer)
\ No newline at end of file diff --git a/health/guides/windows/windows_disk_in_use.md b/src/health/guides/windows/windows_disk_in_use.md index 4642b79ce..1830a2b2c 100644 --- a/health/guides/windows/windows_disk_in_use.md +++ b/src/health/guides/windows/windows_disk_in_use.md @@ -30,5 +30,4 @@ Disk space utilization is crucial for the stable and efficient operation of your ### Useful resources 1. [Windows 10 Tips & Tricks: Analyze Disk Space & Free Space - YouTube](https://www.youtube.com/watch?v=NolLC9tBP_Y) -2. [5 Free Tools to Visualize Disk Space Usage on Windows](https://www.hongkiat.com/blog/visualize-hard-disk-usage-free-tools-for-windows/) -3. [10 Ways to Free Up Hard Drive Space on Windows](https://www.howtogeek.com/125923/7-ways-to-free-up-hard-disk-space-on-windows/)
\ No newline at end of file +2. [10 Ways to Free Up Hard Drive Space on Windows](https://www.howtogeek.com/125923/7-ways-to-free-up-hard-disk-space-on-windows/)
\ No newline at end of file diff --git a/health/guides/windows/windows_inbound_packets_discarded.md b/src/health/guides/windows/windows_inbound_packets_discarded.md index 829e34ffe..039cf9e55 100644 --- a/health/guides/windows/windows_inbound_packets_discarded.md +++ b/src/health/guides/windows/windows_inbound_packets_discarded.md @@ -35,5 +35,4 @@ Packet corruption can be caused by faulty hardware, software issues, or even cyb ### Useful resources 1. [Windows Performance Monitor](https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/perfmon) -2. [Windows Event Viewer](https://docs.microsoft.com/en-us/windows/win32/eventlog/event-log-reference) -3. [How to troubleshoot networking problems on the Windows platform](https://support.microsoft.com/en-us/help/10267)
\ No newline at end of file +2. [Windows Event Viewer](https://docs.microsoft.com/en-us/windows/win32/eventlog/event-log-reference)
\ No newline at end of file diff --git a/health/guides/windows/windows_inbound_packets_errors.md b/src/health/guides/windows/windows_inbound_packets_errors.md index aee982d6a..be1a2869f 100644 --- a/health/guides/windows/windows_inbound_packets_errors.md +++ b/src/health/guides/windows/windows_inbound_packets_errors.md @@ -37,5 +37,4 @@ To troubleshoot this alert, you can perform the following steps: ### Useful resources 1. [How to use Network Monitor in Windows](https://docs.microsoft.com/en-us/windows/client-management/troubleshoot-tcpip-network-monitor) -2. [Network Troubleshooting Guide for Windows](https://techcommunity.microsoft.com/t5/networking-blog/network-troubleshooting-guide-for-windows/ba-p/428114) -3. [How to Troubleshoot Network Connections with Ping and Tracert](https://www.windowscentral.com/how-troubleshoot-network-connection-ping-and-traceroute)
\ No newline at end of file +2. [Network Troubleshooting Guide for Windows](https://techcommunity.microsoft.com/t5/networking-blog/network-troubleshooting-guide-for-windows/ba-p/428114)
\ No newline at end of file diff --git a/health/guides/windows/windows_outbound_packets_discarded.md b/src/health/guides/windows/windows_outbound_packets_discarded.md index 226c3b0ba..1cd5f922f 100644 --- a/health/guides/windows/windows_outbound_packets_discarded.md +++ b/src/health/guides/windows/windows_outbound_packets_discarded.md @@ -44,5 +44,4 @@ If your network is congested, it can cause an increase in discarded packets. Con ### Useful resources 1. [Using Performance Monitor to monitor network performance](https://techcommunity.microsoft.com/t5/ask-the-performance-team/using-perfmon-to-monitor-your-servers-network-performance/ba-p/373944) -2. [Monitoring Network Performance with Resource Monitor](https://www.online-tech-tips.com/computer-tips/monitoring-network-performance-with-resource-monitor/) -3. [Event Viewer in Windows](https://www.dummies.com/computers/operating-systems/windows-10/how-to-use-event-viewer-in-windows-10/)
\ No newline at end of file +2. [Event Viewer in Windows](https://www.dummies.com/computers/operating-systems/windows-10/how-to-use-event-viewer-in-windows-10/)
\ No newline at end of file diff --git a/health/guides/windows/windows_outbound_packets_errors.md b/src/health/guides/windows/windows_outbound_packets_errors.md index 2ccb8ef16..7fcfb65f2 100644 --- a/health/guides/windows/windows_outbound_packets_errors.md +++ b/src/health/guides/windows/windows_outbound_packets_errors.md @@ -40,7 +40,6 @@ Ensure that your network interface card (NIC) drivers and firmware are up-to-dat ### Useful resources -1. [Netstat Command Usage on Windows](https://www.computerhope.com/issues/ch001/stat.htm) -2. [Wireshark - A Network Protocol Analyzer](https://www.wireshark.org/) -3. [Tcpdump - A Packet Analyzer](https://www.tcpdump.org/) -4. [Network Performance Monitoring and Diagnostics Guide](https://docs.microsoft.com/en-us/windows-server/networking/technologies/npmd/npmd)
\ No newline at end of file +1. [Wireshark - A Network Protocol Analyzer](https://www.wireshark.org/) +2. [Tcpdump - A Packet Analyzer](https://www.tcpdump.org/) +3. [Network Performance Monitoring and Diagnostics Guide](https://docs.microsoft.com/en-us/windows-server/networking/technologies/npmd/npmd)
\ No newline at end of file diff --git a/health/guides/windows/windows_ram_in_use.md b/src/health/guides/windows/windows_ram_in_use.md index ef85588b0..f51a92eda 100644 --- a/health/guides/windows/windows_ram_in_use.md +++ b/src/health/guides/windows/windows_ram_in_use.md @@ -33,6 +33,4 @@ Memory utilization refers to the percentage of a system's RAM that is currently ### Useful resources -1. [How to use Task Manager to monitor Windows PC's performance](https://support.microsoft.com/en-us/windows/how-to-use-task-manager-to-monitor-windows-pc-s-performance-171100cb-5e7d-aaba-29abfedfb06f) -2. [How to use Performance Monitor on Windows 10](https://www.windowscentral.com/how-use-performance-monitor-windows-10) -3. [How to fix high memory usage in Windows](https://pureinfotech.com/reduce-ram-memory-usage-windows/)
\ No newline at end of file +1. [How to use Performance Monitor on Windows 10](https://www.windowscentral.com/how-use-performance-monitor-windows-10)
\ No newline at end of file diff --git a/health/guides/windows/windows_swap_in_use.md b/src/health/guides/windows/windows_swap_in_use.md index 5a6500915..38fa06548 100644 --- a/health/guides/windows/windows_swap_in_use.md +++ b/src/health/guides/windows/windows_swap_in_use.md @@ -36,6 +36,5 @@ Swap memory is a virtual memory management technique where a portion of the disk ### Useful resources -1. [How to Manage Virtual Memory (Pagefile) in Windows 10](https://www.techbout.com/manage-virtual-memory-pagefile-windows-10-29638/) -2. [Troubleshooting Windows Performance Issues Using the Resource Monitor](https://docs.microsoft.com/en-us/archive/blogs/askcore/troubleshooting-windows-performance-issues-using-the-resource-monitor) -3. [Windows Performance Monitor](https://docs.microsoft.com/en-us/windows-server/administration/windows-server-2008-help/troubleshoot/windows-rel-performance-monitor)
\ No newline at end of file +1. [Troubleshooting Windows Performance Issues Using the Resource Monitor](https://docs.microsoft.com/en-us/archive/blogs/askcore/troubleshooting-windows-performance-issues-using-the-resource-monitor) +2. [Windows Performance Monitor](https://docs.microsoft.com/en-us/windows-server/administration/windows-server-2008-help/troubleshoot/windows-rel-performance-monitor)
\ No newline at end of file diff --git a/health/guides/x509check/x509check_days_until_expiration.md b/src/health/guides/x509check/x509check_days_until_expiration.md index a37792ab0..18314de54 100644 --- a/health/guides/x509check/x509check_days_until_expiration.md +++ b/src/health/guides/x509check/x509check_days_until_expiration.md @@ -41,5 +41,4 @@ If there are still issues or the alert persists, double-check your certificate m ### Useful resources 1. [Sectigo: What is an X.509 certificate?](https://sectigo.com/resource-library/what-is-x509-certificate) -2. [Netdata: x509 certificate monitoring](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/x509check) -3. [OpenSSL: X.509 Certificate Commands](https://www.openssl.org/docs/man1.1.1/man1/x509.html)
\ No newline at end of file +2. [OpenSSL: X.509 Certificate Commands](https://www.openssl.org/docs/man1.1.1/man1/x509.html)
\ No newline at end of file diff --git a/health/guides/x509check/x509check_revocation_status.md b/src/health/guides/x509check/x509check_revocation_status.md index 2d14f1062..fc48deefe 100644 --- a/health/guides/x509check/x509check_revocation_status.md +++ b/src/health/guides/x509check/x509check_revocation_status.md @@ -28,8 +28,6 @@ This alert indicates that the X.509 certificate has been revoked, meaning that i ### Useful resources -1. [X.509 Certificate Monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/x509check) -2. [How to use OpenSSL to verify a certificate against a CRL](https://raymii.org/s/tutorials/OpenSSL_command_line_Root_and_Intermediate_CA_including_OCSP_CRL_Signed_Certs.html) -3. [SSL Shopper's SSL Checker](https://www.sslshopper.com/ssl-checker.html) -4. [Renewing certificates with Certbot](https://certbot.eff.org/docs/using.html#renewing-certificates) -5. [Creating a Self-Signed SSL Certificate](https://www.akadia.com/services/ssh_test_certificate.html)
\ No newline at end of file +1. [SSL Shopper's SSL Checker](https://www.sslshopper.com/ssl-checker.html) +2. [Renewing certificates with Certbot](https://certbot.eff.org/docs/using.html#renewing-certificates) +3. [Creating a Self-Signed SSL Certificate](https://www.akadia.com/services/ssh_test_certificate.html)
\ No newline at end of file diff --git a/health/guides/zfs/zfs_memory_throttle.md b/src/health/guides/zfs/zfs_memory_throttle.md index 3903a02e7..3903a02e7 100644 --- a/health/guides/zfs/zfs_memory_throttle.md +++ b/src/health/guides/zfs/zfs_memory_throttle.md diff --git a/health/guides/zfs/zfs_pool_state_crit.md b/src/health/guides/zfs/zfs_pool_state_crit.md index 72db4b06b..72db4b06b 100644 --- a/health/guides/zfs/zfs_pool_state_crit.md +++ b/src/health/guides/zfs/zfs_pool_state_crit.md diff --git a/health/guides/zfs/zfs_pool_state_warn.md b/src/health/guides/zfs/zfs_pool_state_warn.md index ffba20456..ffba20456 100644 --- a/health/guides/zfs/zfs_pool_state_warn.md +++ b/src/health/guides/zfs/zfs_pool_state_warn.md diff --git a/src/health/health.c b/src/health/health.c new file mode 100644 index 000000000..7039a193c --- /dev/null +++ b/src/health/health.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health.h" +#include "health_internals.h" + +struct health_plugin_globals health_globals = { + .initialization = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .done = false, + }, + .config = { + .enabled = true, + .stock_enabled = true, + .use_summary_for_notifications = true, + + .health_log_entries_max = HEALTH_LOG_ENTRIES_DEFAULT, + .health_log_history = HEALTH_LOG_HISTORY_DEFAULT, + + .default_warn_repeat_every = 0, + .default_crit_repeat_every = 0, + + .run_at_least_every_seconds = 10, + .postpone_alarms_during_hibernation_for_seconds = 60, + }, + .prototypes = { + .dict = NULL, + } +}; + +bool health_plugin_enabled(void) { + return health_globals.config.enabled; +} + +void health_plugin_disable(void) { + health_globals.config.enabled = false; +} + + +static void health_load_config_defaults(void) { + char filename[FILENAME_MAX + 1]; + + health_globals.config.enabled = + config_get_boolean(CONFIG_SECTION_HEALTH, + "enabled", + health_globals.config.enabled); + + health_globals.config.stock_enabled = + 
config_get_boolean(CONFIG_SECTION_HEALTH, + "enable stock health configuration", + health_globals.config.stock_enabled); + + health_globals.config.use_summary_for_notifications = + config_get_boolean(CONFIG_SECTION_HEALTH, + "use summary for notifications", + health_globals.config.use_summary_for_notifications); + + health_globals.config.default_warn_repeat_every = + config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); + + health_globals.config.default_crit_repeat_every = + config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); + + health_globals.config.health_log_entries_max = + config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", + health_globals.config.health_log_entries_max); + + health_globals.config.health_log_history = + config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY); + + snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir); + health_globals.config.default_exec = + string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename)); + + health_globals.config.enabled_alerts = + simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), + NULL, SIMPLE_PATTERN_EXACT, true); + + health_globals.config.run_at_least_every_seconds = + (int)config_get_number(CONFIG_SECTION_HEALTH, + "run at least every seconds", + health_globals.config.run_at_least_every_seconds); + + health_globals.config.postpone_alarms_during_hibernation_for_seconds = + config_get_number(CONFIG_SECTION_HEALTH, + "postpone alarms during hibernation for seconds", + health_globals.config.postpone_alarms_during_hibernation_for_seconds); + + health_globals.config.default_recipient = + string_strdupz("root"); + + // ------------------------------------------------------------------------ + // verify after loading + + if(health_globals.config.run_at_least_every_seconds < 1) + 
health_globals.config.run_at_least_every_seconds = 1; + + if(health_globals.config.health_log_entries_max < HEALTH_LOG_ENTRIES_MIN) { + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Health configuration has invalid max log entries %u, using minimum of %u", + health_globals.config.health_log_entries_max, + HEALTH_LOG_ENTRIES_MIN); + + health_globals.config.health_log_entries_max = HEALTH_LOG_ENTRIES_MIN; + config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", + (long)health_globals.config.health_log_entries_max); + } + else if(health_globals.config.health_log_entries_max > HEALTH_LOG_ENTRIES_MAX) { + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Health configuration has invalid max log entries %u, using maximum of %u", + health_globals.config.health_log_entries_max, + HEALTH_LOG_ENTRIES_MAX); + + health_globals.config.health_log_entries_max = HEALTH_LOG_ENTRIES_MAX; + config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", + (long)health_globals.config.health_log_entries_max); + } + + if (health_globals.config.health_log_history < HEALTH_LOG_MINIMUM_HISTORY) { + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Health configuration has invalid health log history %u. 
Using minimum %d", + health_globals.config.health_log_history, HEALTH_LOG_MINIMUM_HISTORY); + + health_globals.config.health_log_history = HEALTH_LOG_MINIMUM_HISTORY; + config_set_number(CONFIG_SECTION_HEALTH, "health log history", health_globals.config.health_log_history); + } + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Health log history is set to %u seconds (%u days)", + health_globals.config.health_log_history, health_globals.config.health_log_history / 86400); +} + +inline char *health_user_config_dir(void) { + char buffer[FILENAME_MAX + 1]; + snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir); + return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer); +} + +inline char *health_stock_config_dir(void) { + char buffer[FILENAME_MAX + 1]; + snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir); + return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer); +} + +void health_plugin_init(void) { + spinlock_lock(&health_globals.initialization.spinlock); + + if(health_globals.initialization.done) + goto cleanup; + + health_globals.initialization.done = true; + + health_init_prototypes(); + health_load_config_defaults(); + + if(!health_plugin_enabled()) + goto cleanup; + + health_reload_prototypes(); + health_silencers_init(); + +cleanup: + spinlock_unlock(&health_globals.initialization.spinlock); +} + +void health_plugin_destroy(void) { + ; +} + +void health_plugin_reload(void) { + health_reload_prototypes(); + health_apply_prototypes_to_all_hosts(); +} diff --git a/src/health/health.d/adaptec_raid.conf b/src/health/health.d/adaptec_raid.conf new file mode 100644 index 000000000..b01113b69 --- /dev/null +++ b/src/health/health.d/adaptec_raid.conf @@ -0,0 +1,29 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: adaptec_raid_ld_health_status + on: adaptecraid.logical_device_status + class: Errors + type: System +component: RAID + lookup: 
average -1m unaligned percentage of ok + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 1.5 max 1h + summary: Adaptec RAID LD (number ${label:ld_number}) health status + info: Adaptec RAID logical device (number ${label:ld_number} name ${label:ld_name}) health status is critical + to: sysadmin + + template: adaptec_raid_pd_health_state + on: adaptecraid.physical_device_state + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of ok + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 1.5 max 1h + summary: Adaptec RAID PD (number ${label:pd_number}) health state + info: Adaptec RAID physical device (number ${label:pd_number} location ${label:location}) health state is critical + to: sysadmin diff --git a/src/health/health.d/anomalies.conf b/src/health/health.d/anomalies.conf new file mode 100644 index 000000000..80d63bb8d --- /dev/null +++ b/src/health/health.d/anomalies.conf @@ -0,0 +1,25 @@ +## raise a warning alarm if an anomaly probability is consistently above 50% + +## "foreach" was removed, these alarms don't work anymore + +# template: anomalies_anomaly_probabilities +# on: anomalies.probability +# class: Errors +# type: Netdata +#component: ML +# lookup: average -2m foreach * +# every: 1m +# warn: $this > 50 +# info: average anomaly probability over the last 2 minutes + +# raise a warning alarm if an anomaly flag is consistently firing + +# template: anomalies_anomaly_flags +# on: anomalies.anomaly +# class: Errors +# type: Netdata +#component: ML +# lookup: sum -2m foreach * +# every: 1m +# warn: $this > 10 +# info: number of anomalies in the last 2 minutes diff --git a/health/health.d/apcupsd.conf b/src/health/health.d/apcupsd.conf index 90a72af19..5fd7aa112 100644 --- a/health/health.d/apcupsd.conf +++ b/src/health/health.d/apcupsd.conf @@ -5,8 +5,6 @@ class: Utilization type: Power Supply component: UPS - os: * - hosts: * lookup: average -10m unaligned of percentage units: 
% every: 1m @@ -23,8 +21,6 @@ component: UPS class: Errors type: Power Supply component: UPS - os: * - hosts: * lookup: average -60s unaligned of charge units: % every: 60s diff --git a/health/health.d/bcache.conf b/src/health/health.d/bcache.conf index 446173428..446173428 100644 --- a/health/health.d/bcache.conf +++ b/src/health/health.d/bcache.conf diff --git a/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf index 0d37f28e0..0d37f28e0 100644 --- a/health/health.d/beanstalkd.conf +++ b/src/health/health.d/beanstalkd.conf diff --git a/health/health.d/boinc.conf b/src/health/health.d/boinc.conf index 092a56845..6fd987de1 100644 --- a/health/health.d/boinc.conf +++ b/src/health/health.d/boinc.conf @@ -1,4 +1,4 @@ -# Alarms for various BOINC issues. +# you can disable an alarm notification by setting the 'to' line to: silent # Warn on any compute errors encountered. template: boinc_compute_errors @@ -6,8 +6,6 @@ class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of comperror units: tasks every: 1m @@ -23,8 +21,6 @@ component: BOINC class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of upload_failed units: tasks every: 1m @@ -40,8 +36,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of total units: tasks every: 1m @@ -57,8 +51,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of active calc: ($boinc_total_tasks >= 1) ? 
($this) : (inf) units: tasks diff --git a/health/health.d/btrfs.conf b/src/health/health.d/btrfs.conf index 1557a5941..f43f600c0 100644 --- a/health/health.d/btrfs.conf +++ b/src/health/health.d/btrfs.conf @@ -1,11 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent template: btrfs_allocated on: btrfs.disk class: Utilization type: System component: File system - os: * - hosts: * calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) units: % every: 10s @@ -20,8 +19,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -37,8 +34,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: ($used + $reserved) * 100 / ($used + $free + $reserved) units: % every: 10s @@ -54,8 +49,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -71,8 +64,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of read_errs warn: $this > 0 @@ -86,8 +77,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of write_errs crit: $this > 0 @@ -101,8 +90,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of flush_errs crit: $this > 0 @@ -116,8 +103,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of corruption_errs warn: $this > 0 @@ -131,8 +116,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of generation_errs 
warn: $this > 0 diff --git a/health/health.d/ceph.conf b/src/health/health.d/ceph.conf index 44d351338..44d351338 100644 --- a/health/health.d/ceph.conf +++ b/src/health/health.d/ceph.conf diff --git a/src/health/health.d/cgroups.conf b/src/health/health.d/cgroups.conf new file mode 100644 index 000000000..52ca02624 --- /dev/null +++ b/src/health/health.d/cgroups.conf @@ -0,0 +1,67 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: cgroup_10min_cpu_usage + on: cgroup.cpu_limit + class: Utilization + type: Cgroups + component: CPU +host labels: _os=linux + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} CPU utilization + info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes + to: silent + + template: cgroup_ram_in_use + on: cgroup.mem_usage + class: Utilization + type: Cgroups + component: Memory +host labels: _os=linux + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} memory utilization + info: Cgroup ${label:cgroup_name} memory utilization + to: silent + +# ---------------------------------K8s containers-------------------------------------------- + + template: k8s_cgroup_10min_cpu_usage + on: k8s.cgroup.cpu_limit + class: Utilization + type: Cgroups + component: CPU +host labels: _os=linux + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? 
(75) : (85)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization + info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + average CPU utilization over the last 10 minutes + to: silent + + template: k8s_cgroup_ram_in_use + on: k8s.cgroup.mem_usage + class: Utilization + type: Cgroups + component: Memory +host labels: _os=linux + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + memory utilization + to: silent diff --git a/src/health/health.d/clickhouse.conf b/src/health/health.d/clickhouse.conf new file mode 100644 index 000000000..e24f71830 --- /dev/null +++ b/src/health/health.d/clickhouse.conf @@ -0,0 +1,140 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: clickhouse_restarted + on: clickhouse.uptime + class: Error + type: Database +component: ClickHouse + calc: $uptime + units: seconds + every: 10s + warn: $this > 1 AND $this < 180 + summary: ClickHouse restart detected + info: ClickHouse has recently been restarted + to: silent + + template: clickhouse_queries_preempted + on: clickhouse.queries_preempted + class: Workload + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: preempted_queries + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse preempted queries detected + info: ClickHouse has queries that are stopped and waiting due to priority setting + to: dba + + template: clickhouse_long_running_query + on: clickhouse.longest_running_query_time + class: 
Latency + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: seconds + every: 10s + warn: $this > (($status >= $WARNING) ? (300) : (600)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse long-running query detected + info: ClickHouse has a long-running query exceeding the threshold + to: dba + + template: clickhouse_rejected_inserts + on: clickhouse.rejected_inserts + class: Workload + type: Database +component: ClickHouse + lookup: sum -1m unaligned + units: rejected_inserts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse rejected INSERT queries detected + info: ClickHouse has INSERT queries that are rejected due to high number of active data parts for partition in a MergeTree + to: dba + + template: clickhouse_delayed_inserts + on: clickhouse.delayed_inserts + class: Workload + type: Database +component: ClickHouse + lookup: sum -1m unaligned + units: delayed_inserts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse delayed INSERT queries detected + info: ClickHouse has INSERT queries that are throttled due to high number of active data parts for partition in a MergeTree + to: silent + + template: clickhouse_replication_lag + on: clickhouse.replicas_max_absolute_delay + class: Workload + type: Database +component: ClickHouse + lookup: avg -1m unaligned + units: seconds + every: 10s + warn: $this > (($status >= $WARNING) ? 
(250) : (300)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse high replication lag detected + info: ClickHouse is experiencing replication lag greater than 5 minutes + to: dba + + template: clickhouse_replicated_readonly_tables + on: clickhouse.replicated_readonly_tables + class: Error + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: readonly_tables + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse replicated tables in readonly state detected + info: ClickHouse has replicated tables in readonly state due to ZooKeeper session loss/startup without ZooKeeper configured + to: dba + + template: clickhouse_max_part_count_for_partition + on: clickhouse.max_part_count_for_partition + class: Workload + type: Database +component: ClickHouse + lookup: avg -1m unaligned + units: parts + every: 10s + warn: $this > (($status >= $WARNING) ? (200) : (300)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse high parts/partition detected + info: ClickHouse high number of parts per partition + to: dba + + template: clickhouse_distributed_connections_failures + on: clickhouse.distributed_connections_fail_exhausted_retries + class: Error + type: Database +component: ClickHouse + lookup: sum -1m unaligned + units: failures + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse distributed connections failures detected + info: ClickHouse has failed distributed connections after exhausting all retry attempts + to: dba + + template: clickhouse_distributed_files_to_insert + on: clickhouse.distributed_files_to_insert + class: Workload + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: files + every: 10s + warn: $this > (($status >= $WARNING) ? 
(40) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse high files to insert detected + info: ClickHouse high number of pending files to process for asynchronous insertion into Distributed tables + to: silent diff --git a/health/health.d/cockroachdb.conf b/src/health/health.d/cockroachdb.conf index 60f178354..60f178354 100644 --- a/health/health.d/cockroachdb.conf +++ b/src/health/health.d/cockroachdb.conf diff --git a/health/health.d/consul.conf b/src/health/health.d/consul.conf index 8b414a26d..8b414a26d 100644 --- a/health/health.d/consul.conf +++ b/src/health/health.d/consul.conf diff --git a/src/health/health.d/cpu.conf b/src/health/health.d/cpu.conf new file mode 100644 index 000000000..29f541e56 --- /dev/null +++ b/src/health/health.d/cpu.conf @@ -0,0 +1,65 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=linux + lookup: average -10m unaligned of user,system,softirq,irq,guest + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) + to: sysadmin + + template: 10min_cpu_iowait + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=linux + lookup: average -10m unaligned of iowait + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (40)) + delay: up 30m down 30m multiplier 1.5 max 2h + summary: System CPU iowait time + info: Average CPU iowait time over the last 10 minutes + to: silent + + template: 20min_steal_cpu + on: system.cpu + class: Latency + type: System + component: CPU +host labels: _os=linux + lookup: average -20m unaligned of steal + units: % + every: 5m + warn: $this > (($status >= $WARNING) ? 
(5) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System CPU steal time + info: Average CPU steal time over the last 20 minutes + to: silent + +## FreeBSD + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=freebsd + lookup: average -10m unaligned of user,system,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding nice) + to: sysadmin diff --git a/health/health.d/dbengine.conf b/src/health/health.d/dbengine.conf index 0a70d2e8f..5585a9533 100644 --- a/health/health.d/dbengine.conf +++ b/src/health/health.d/dbengine.conf @@ -1,4 +1,3 @@ - # you can disable an alarm notification by setting the 'to' line to: silent alarm: 10min_dbengine_global_fs_errors @@ -6,8 +5,6 @@ class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of fs_errors units: errors every: 10s @@ -22,8 +19,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of io_errors units: errors every: 10s @@ -38,8 +33,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of pg_cache_over_half_dirty_events units: errors every: 10s @@ -55,8 +48,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of flushing_pressure_deletions units: pages every: 10s diff --git a/src/health/health.d/disks.conf b/src/health/health.d/disks.conf new file mode 100644 index 000000000..fe96837fb --- /dev/null +++ b/src/health/health.d/disks.conf @@ -0,0 +1,161 @@ +# you can disable an alarm notification by setting the 'to' 
line to: silent + +# ----------------------------------------------------------------------------- +# low disk space + +# checking the latest collected values +# raise an alarm if the disk is low on +# available disk space + + template: disk_space_usage + on: disk.space + class: Utilization + type: System + component: Disk + host labels: _os=linux freebsd +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (80) : (90)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} space usage + info: Total space utilization of disk ${label:mount_point} + to: sysadmin + + template: disk_inode_usage + on: disk.inodes + class: Utilization + type: System + component: Disk + host labels: _os=linux freebsd +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} inode usage + info: Total inode utilization of disk ${label:mount_point} + to: sysadmin + + +# ----------------------------------------------------------------------------- +# disk fill rate + +# calculate the rate the disk fills +# use as base, the available space change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + + template: disk_fill_rate + on: disk.space +host labels: _os=linux freebsd + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour + +# calculate the hours remaining +# if the disk continues to fill in this rate + + template: out_of_disk_space_time + on: disk.space +host labels: _os=linux freebsd + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of space + info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour + to: silent + + +# ----------------------------------------------------------------------------- +# disk inode fill rate + +# calculate the rate the disk inodes are allocated +# use as base, the available inodes change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + + template: disk_inode_rate + on: disk.inodes +host labels: _os=linux freebsd + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: inodes/hour + info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour + +# calculate the hours remaining +# if the disk inodes are allocated +# in this rate + + template: out_of_disk_inodes_time + on: disk.inodes +host labels: _os=linux freebsd + calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of inodes + info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour + to: silent + + +# ----------------------------------------------------------------------------- +# disk congestion + +# raise an alarm if the disk is congested +# by calculating the average disk utilization +# for the last 10 minutes + + template: 10min_disk_utilization + on: disk.util + class: Utilization + type: System + component: Disk +host labels: _os=linux freebsd + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} utilization + info: Average percentage of time ${label:device} disk was busy over the last 10 minutes + to: silent + + +# raise an alarm if the disk backlog +# is above 1000ms (1s) per second +# for 10 minutes +# (i.e. the disk cannot catch up) + + template: 10min_disk_backlog + on: disk.backlog + class: Latency + type: System + component: Disk +host labels: _os=linux freebsd + lookup: average -10m unaligned + units: ms + every: 1m + warn: $this > 5000 * (($status >= $WARNING) ? 
(0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} backlog + info: Average backlog size of the ${label:device} disk over the last 10 minutes + to: silent diff --git a/health/health.d/dns_query.conf b/src/health/health.d/dns_query.conf index 756c6a1b6..756c6a1b6 100644 --- a/health/health.d/dns_query.conf +++ b/src/health/health.d/dns_query.conf diff --git a/health/health.d/dnsmasq_dhcp.conf b/src/health/health.d/dnsmasq_dhcp.conf index f6ef01940..f6ef01940 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/src/health/health.d/dnsmasq_dhcp.conf diff --git a/health/health.d/docker.conf b/src/health/health.d/docker.conf index 668614d4d..668614d4d 100644 --- a/health/health.d/docker.conf +++ b/src/health/health.d/docker.conf diff --git a/health/health.d/elasticsearch.conf b/src/health/health.d/elasticsearch.conf index 600840c58..600840c58 100644 --- a/health/health.d/elasticsearch.conf +++ b/src/health/health.d/elasticsearch.conf diff --git a/src/health/health.d/entropy.conf b/src/health/health.d/entropy.conf new file mode 100644 index 000000000..f7671353c --- /dev/null +++ b/src/health/health.d/entropy.conf @@ -0,0 +1,19 @@ + +# check if entropy is too low +# the alarm is checked every 1 minute +# and examines the last hour of data + + alarm: lowest_entropy + on: system.entropy + class: Utilization + type: System + component: Cryptography +host labels: _os=linux + lookup: min -5m unaligned + units: entries + every: 5m + warn: $this < (($status >= $WARNING) ? 
(200) : (100)) + delay: down 1h multiplier 1.5 max 2h + summary: System entropy pool number of entries + info: Minimum number of entries in the random numbers pool in the last 5 minutes + to: silent diff --git a/health/health.d/exporting.conf b/src/health/health.d/exporting.conf index c0320193c..c0320193c 100644 --- a/health/health.d/exporting.conf +++ b/src/health/health.d/exporting.conf diff --git a/src/health/health.d/file_descriptors.conf b/src/health/health.d/file_descriptors.conf new file mode 100644 index 000000000..b4b4500e3 --- /dev/null +++ b/src/health/health.d/file_descriptors.conf @@ -0,0 +1,30 @@ + # you can disable an alarm notification by setting the 'to' line to: silent + + template: system_file_descriptors_utilization + on: system.file_nr_utilization + class: Utilization + type: System + component: Processes + lookup: max -1m unaligned + units: % + every: 1m + crit: $this > 90 + delay: down 15m multiplier 1.5 max 1h + summary: System open file descriptors utilization + info: System-wide utilization of open files + to: sysadmin + + template: apps_group_file_descriptors_utilization + on: app.fds_open_limit + class: Utilization + type: System + component: Process +host labels: _os=linux + lookup: max -10s unaligned + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: App group ${label:app_group} file descriptors utilization + info: Open files percentage against the processes limits, among all PIDs in application group + to: sysadmin diff --git a/health/health.d/gearman.conf b/src/health/health.d/gearman.conf index 78e1165d1..78e1165d1 100644 --- a/health/health.d/gearman.conf +++ b/src/health/health.d/gearman.conf diff --git a/health/health.d/geth.conf b/src/health/health.d/geth.conf index 361b6b41f..361b6b41f 100644 --- a/health/health.d/geth.conf +++ b/src/health/health.d/geth.conf diff --git a/src/health/health.d/go.d.plugin.conf b/src/health/health.d/go.d.plugin.conf new file mode 100644 index 000000000..eb951448b --- /dev/null +++ b/src/health/health.d/go.d.plugin.conf @@ -0,0 +1,17 @@ +# make sure go.d.plugin data collection job is running + + template: go.d_job_last_collected_secs + on: netdata.go_plugin_execution_time + class: Errors + type: Netdata + component: go.d.plugin +host labels: _hostname=!* + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Go.d plugin last collection + info: Number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/haproxy.conf b/src/health/health.d/haproxy.conf index 66a488fa4..66a488fa4 100644 --- a/health/health.d/haproxy.conf +++ b/src/health/health.d/haproxy.conf diff --git a/health/health.d/hdfs.conf b/src/health/health.d/hdfs.conf index 566e815aa..566e815aa 100644 --- a/health/health.d/hdfs.conf +++ b/src/health/health.d/hdfs.conf diff --git a/src/health/health.d/httpcheck.conf b/src/health/health.d/httpcheck.conf new file mode 100644 index 000000000..3d32dedbf --- /dev/null +++ b/src/health/health.d/httpcheck.conf @@ -0,0 +1,88 @@ + +# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges + template: httpcheck_web_service_up + on: httpcheck.status + class: Utilization + type: Web Server +component: HTTP endpoint + lookup: average -1m unaligned percentage of success + calc: ($this < 75) ? 
(0) : ($this) + every: 5s + units: up/down + info: HTTP check endpoint ${label:url} liveness status + to: silent + + template: httpcheck_web_service_bad_content + on: httpcheck.status + class: Workload + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_content + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected content + info: Percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_bad_status + on: httpcheck.status + class: Workload + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_status + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected status + info: Percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_bad_header + on: httpcheck.status + class: Errors + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_header + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected header + info: Percentage of HTTP responses from ${label:url} with unexpected header in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_timeouts + on: httpcheck.status + class: Latency + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of timeout + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} timeouts + info: Percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes + to: 
webmaster + + template: httpcheck_web_service_no_connection + on: httpcheck.status + class: Errors + type: Other +component: HTTP endpoint + lookup: average -5m unaligned percentage of no_connection + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} failed requests + info: Percentage of failed HTTP requests to ${label:url} in the last 5 minutes + to: webmaster diff --git a/health/health.d/ioping.conf b/src/health/health.d/ioping.conf index 6d832bf00..6d832bf00 100644 --- a/health/health.d/ioping.conf +++ b/src/health/health.d/ioping.conf diff --git a/src/health/health.d/ipc.conf b/src/health/health.d/ipc.conf new file mode 100644 index 000000000..f46cf4285 --- /dev/null +++ b/src/health/health.d/ipc.conf @@ -0,0 +1,32 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: semaphores_used + on: system.ipc_semaphores + class: Utilization + type: System + component: IPC +host labels: _os=linux + calc: $semaphores * 100 / $ipc_semaphores_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: IPC semaphores used + info: IPC semaphore utilization + to: sysadmin + + alarm: semaphore_arrays_used + on: system.ipc_semaphore_arrays + class: Utilization + type: System + component: IPC +host labels: _os=linux + calc: $arrays * 100 / $ipc_semaphores_arrays_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(70) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: IPC semaphore arrays used + info: IPC semaphore arrays utilization + to: sysadmin diff --git a/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf index 4dfee3c7f..4dfee3c7f 100644 --- a/health/health.d/ipfs.conf +++ b/src/health/health.d/ipfs.conf diff --git a/health/health.d/ipmi.conf b/src/health/health.d/ipmi.conf index cec2320a9..cec2320a9 100644 --- a/health/health.d/ipmi.conf +++ b/src/health/health.d/ipmi.conf diff --git a/src/health/health.d/isc_dhcpd.conf b/src/health/health.d/isc_dhcpd.conf new file mode 100644 index 000000000..3f6e9d5d4 --- /dev/null +++ b/src/health/health.d/isc_dhcpd.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: isc_dhcpd_dhcp_pool_utilization + on: isc_dhcpd.dhcp_pool_utilization + class: Utilization + type: DHCP +component: DHCPd + every: 10s + units: % + calc: $used + warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) + delay: down 5m + summary: ISC DHCP pool ${label:dhcp_pool_name} utilization + info: ISC DHCP pool ${label:dhcp_pool_name} utilization + to: sysadmin diff --git a/health/health.d/kubelet.conf b/src/health/health.d/kubelet.conf index 8adf5f7d4..8adf5f7d4 100644 --- a/health/health.d/kubelet.conf +++ b/src/health/health.d/kubelet.conf diff --git a/health/health.d/linux_power_supply.conf b/src/health/health.d/linux_power_supply.conf index b0d35e752..b0d35e752 100644 --- a/health/health.d/linux_power_supply.conf +++ b/src/health/health.d/linux_power_supply.conf diff --git a/src/health/health.d/load.conf b/src/health/health.d/load.conf new file mode 100644 index 000000000..e639c9ad5 --- /dev/null +++ b/src/health/health.d/load.conf @@ -0,0 +1,67 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# Calculate the base trigger point for the load average alarms. 
+# This is the maximum number of CPU's in the system over the past 1 +# minute, with a special case for a single CPU of setting the trigger at 2. + alarm: load_cpu_number + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) ) + units: cpus + every: 1m + info: Number of active CPU cores in the system + +# Send alarms if the load average is unusually high. +# These intentionally _do not_ calculate the average over the sampled +# time period because the values being checked already are averages. + + alarm: load_average_15 + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + lookup: max -1m unaligned of load15 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (15 minutes) + info: System load average for the past 15 minutes + to: silent + + alarm: load_average_5 + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + lookup: max -1m unaligned of load5 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (5 minutes) + info: System load average for the past 5 minutes + to: silent + + alarm: load_average_1 + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + lookup: max -1m unaligned of load1 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 
700 : 800) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (1 minute) + info: System load average for the past 1 minute + to: silent diff --git a/src/health/health.d/lvm.conf b/src/health/health.d/lvm.conf new file mode 100644 index 000000000..570aa14d3 --- /dev/null +++ b/src/health/health.d/lvm.conf @@ -0,0 +1,31 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: lvm_lv_data_space_utilization + on: lvm.lv_data_space_utilization + class: Utilization + type: System + component: LVM + calc: $utilization + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) + delay: down 5m multiplier 1.5 max 1h + summary: LVM LV ${label:lv_name} on VG ${label:vg_name} high data space usage + info: LVM logical volume high data space usage (LV ${label:lv_name} VG ${label:vg_name} Type ${label:volume_type}) + to: sysadmin + + template: lvm_lv_metadata_space_utilization + on: lvm.lv_metadata_space_utilization + class: Utilization + type: System + component: LVM + calc: $utilization + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: ($this > (($status == $CRITICAL) ? 
(90) : (98))) + delay: down 5m multiplier 1.5 max 1h + summary: LVM LV ${label:lv_name} on VG ${label:vg_name} high metadata space usage + info: LVM logical volume high metadata space usage (LV ${label:lv_name} VG ${label:vg_name} Type ${label:volume_type}) + to: sysadmin diff --git a/health/health.d/mdstat.conf b/src/health/health.d/mdstat.conf index 90f97d851..90f97d851 100644 --- a/health/health.d/mdstat.conf +++ b/src/health/health.d/mdstat.conf diff --git a/src/health/health.d/megacli.conf b/src/health/health.d/megacli.conf new file mode 100644 index 000000000..27721fa9a --- /dev/null +++ b/src/health/health.d/megacli.conf @@ -0,0 +1,77 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# Adapters (controllers) + + template: megacli_adapter_health_state + on: megacli.adapter_health_state + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of optimal + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: MegaCLI adapter ${label:adapter_number} health + info: MegaCLI adapter ${label:adapter_number} is in the degraded state + to: sysadmin + + template: megacli_phys_drive_media_errors + on: megacli.phys_drive_media_errors + class: Errors + type: System +component: RAID + lookup: sum -10s + units: media errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: MegaCLI PD adapter ${label:adapter_number} slot ${label:slot_number} media errors + info: MegaCLI physical drive adapter ${label:adapter_number} slot ${label:slot_number} media errors + to: sysadmin + +# Physical Drives + + template: megacli_phys_drive_predictive_failures + on: megacli.phys_drive_predictive_failures + class: Errors + type: System +component: RAID + lookup: sum -10s + units: failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: MegaCLI PD adapter ${label:adapter_number} slot ${label:slot_number} predictive 
failures + info: MegaCLI physical drive (adapter ${label:adapter_number} slot ${label:slot_number}) predictive failures + to: sysadmin + +# Backup Battery Unit + + template: megacli_bbu_charge + on: megacli.bbu_charge + class: Workload + type: System +component: RAID + lookup: average -10s + units: percent + every: 10s + warn: $this <= (($status >= $WARNING) ? (85) : (80)) + crit: $this <= (($status == $CRITICAL) ? (50) : (40)) + summary: MegaCLI BBU charge + info: MegaCLI Backup Battery Unit (adapter ${label:adapter_number}) average charge over the last minute + to: sysadmin + + template: megacli_bbu_recharge_cycles + on: megacli.bbu_recharge_cycles + class: Workload + type: System +component: RAID + lookup: average -10s + units: cycles + every: 10s + warn: $this >= 100 + crit: $this >= 500 + summary: MegaCLI BBU recharge cycles + info: MegaCLI Backup Battery Unit (adapter ${label:adapter_number}) recharge cycles + to: sysadmin diff --git a/health/health.d/memcached.conf b/src/health/health.d/memcached.conf index 77ca0afa9..77ca0afa9 100644 --- a/health/health.d/memcached.conf +++ b/src/health/health.d/memcached.conf diff --git a/src/health/health.d/memory.conf b/src/health/health.d/memory.conf new file mode 100644 index 000000000..2b2b4e4da --- /dev/null +++ b/src/health/health.d/memory.conf @@ -0,0 +1,76 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 1hour_memory_hw_corrupted + on: mem.hwcorrupt + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $HardwareCorrupted + units: MB + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + summary: System corrupted memory + info: Amount of memory corrupted due to a hardware failure + to: sysadmin + +## ECC Controller + + template: ecc_memory_mc_correctable + on: mem.edac_mc_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $correctable + $correctable_noinfo + units: errors + every: 1m + 
warn: $this > 0 + summary: System ECC memory ${label:controller} correctable errors + info: Memory controller ${label:controller} ECC correctable errors + to: sysadmin + + template: ecc_memory_mc_uncorrectable + on: mem.edac_mc_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $uncorrectable + $uncorrectable_noinfo + units: errors + every: 1m + crit: $this > 0 + summary: System ECC memory ${label:controller} uncorrectable errors + info: Memory controller ${label:controller} ECC uncorrectable errors + to: sysadmin + +## ECC DIMM + + template: ecc_memory_dimm_correctable + on: mem.edac_mc_dimm_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $correctable + units: errors + every: 1m + warn: $this > 0 + summary: System ECC memory DIMM ${label:dimm} correctable errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors + to: sysadmin + + template: ecc_memory_dimm_uncorrectable + on: mem.edac_mc_dimm_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $uncorrectable + units: errors + every: 1m + crit: $this > 0 + summary: System ECC memory DIMM ${label:dimm} uncorrectable errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors + to: sysadmin diff --git a/health/health.d/ml.conf b/src/health/health.d/ml.conf index aef9b0368..b6a5df6dd 100644 --- a/health/health.d/ml.conf +++ b/src/health/health.d/ml.conf @@ -13,8 +13,6 @@ class: Workload type: System component: ML - os: * - hosts: * lookup: average -1m of anomaly_rate calc: $this units: % @@ -29,8 +27,6 @@ component: ML # if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). 
# template: ml_5min_cpu_dims # on: system.cpu -# os: linux -# hosts: * # lookup: average -5m anomaly-bit foreach * # calc: $this # units: % @@ -44,8 +40,6 @@ component: ML # if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). # template: ml_5min_cpu_chart # on: system.cpu -# os: linux -# hosts: * # lookup: average -5m anomaly-bit of * # calc: $this # units: % @@ -53,4 +47,3 @@ component: ML # warn: $this > (($status >= $WARNING) ? (5) : (20)) # crit: $this > (($status == $CRITICAL) ? (20) : (100)) # info: rolling 5min anomaly rate for system.cpu chart - diff --git a/health/health.d/mysql.conf b/src/health/health.d/mysql.conf index 572560b4e..572560b4e 100644 --- a/health/health.d/mysql.conf +++ b/src/health/health.d/mysql.conf diff --git a/src/health/health.d/net.conf b/src/health/health.d/net.conf new file mode 100644 index 000000000..448a3733d --- /dev/null +++ b/src/health/health.d/net.conf @@ -0,0 +1,239 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# net traffic overflow + + template: interface_speed + on: net.net + class: Latency + type: System +component: Network + calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max / 1000) : ( nan ) + units: Mbit + every: 10s + info: Network interface ${label:device} current speed + + template: 1m_received_traffic_overflow + on: net.net + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of received + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + summary: System network interface ${label:device} inbound utilization + info: Average inbound utilization for the network interface ${label:device} over the last minute + to: silent + + template: 1m_sent_traffic_overflow + on: net.net + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of sent + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + summary: System network interface ${label:device} outbound utilization + info: Average outbound utilization for the network interface ${label:device} over the last minute + to: silent + +# ----------------------------------------------------------------------------- +# dropped packets + +# check if an interface is dropping packets +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data +# +# it is possible to have expected packet drops on an interface for some network configurations +# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information + + template: net_interface_inbound_packets + on: net.packets + class: Workload + type: System +component: Network + lookup: sum -10m unaligned absolute of received + units: packets + every: 1m + summary: Network interface ${label:device} received packets + info: Received packets for the network interface ${label:device} in the last 10 minutes + + template: net_interface_outbound_packets + on: net.packets + class: Workload + type: System +component: Network + lookup: sum -10m unaligned absolute of sent + units: packets + every: 1m + summary: Network interface ${label:device} sent packets + info: Sent packets for the network interface ${label:device} in the last 10 minutes + + template: inbound_packets_dropped_ratio + on: net.drops + 
class: Errors + type: System + component: Network +chart labels: device=!wl* * + lookup: sum -10m unaligned absolute of inbound + calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound drops + info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: outbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System + component: Network +chart labels: device=!wl* * + lookup: sum -10m unaligned absolute of outbound + calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound drops + info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: wifi_inbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System + component: Network + host labels: _os=linux +chart labels: device=wl* + lookup: sum -10m unaligned absolute of received + calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound drops ratio + info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: wifi_outbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System + component: Network + host labels: _os=linux +chart labels: device=wl* + lookup: sum -10m unaligned absolute of sent + calc: (($net_interface_outbound_packets > 1000) ? 
($this * 100 / $net_interface_outbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound drops ratio + info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + +# ----------------------------------------------------------------------------- +# interface errors + + template: interface_inbound_errors + on: net.errors + class: Errors + type: System + component: Network +host labels: _os=freebsd + lookup: sum -10m unaligned absolute of inbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound errors + info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes + to: silent + + template: interface_outbound_errors + on: net.errors + class: Errors + type: System + component: Network +host labels: _os=freebsd + lookup: sum -10m unaligned absolute of outbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound errors + info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes + to: silent + +# ----------------------------------------------------------------------------- +# FIFO errors + +# check if an interface is having FIFO +# buffer errors +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data + + template: 10min_fifo_errors + on: net.fifo + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: sum -10m unaligned absolute + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} FIFO errors + info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes + to: silent + 
+# ----------------------------------------------------------------------------- +# check for packet storms + +# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +# 2. do the same for the last 10s +# 3. raise an alarm if the latter is 10x or 20x the first +# we assume the minimum packet storm should at least have +# 10000 packets/s, average of the last 10 seconds + + template: 1m_received_packets_rate + on: net.packets + class: Workload + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -1m unaligned of received + units: packets + every: 10s + info: Average number of packets received by the network interface ${label:device} over the last minute + + template: 10s_received_packets_storm + on: net.packets + class: Workload + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -10s unaligned of received + calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) + options: no-clear-notification + summary: System network interface ${label:device} inbound packet storm + info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ + compared to the rate over the last minute + to: silent diff --git a/src/health/health.d/netfilter.conf b/src/health/health.d/netfilter.conf new file mode 100644 index 000000000..e0a05c8de --- /dev/null +++ b/src/health/health.d/netfilter.conf @@ -0,0 +1,18 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: netfilter_conntrack_full + on: netfilter.conntrack_sockets + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: max -10s unaligned of connections + calc: $this * 100 / $netfilter_conntrack_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) + delay: down 5m multiplier 1.5 max 1h + summary: System Netfilter connection tracker utilization + info: Netfilter connection tracker table size utilization + to: sysadmin diff --git a/health/health.d/nvme.conf b/src/health/health.d/nvme.conf index aea402e88..aea402e88 100644 --- a/health/health.d/nvme.conf +++ b/src/health/health.d/nvme.conf diff --git a/health/health.d/pihole.conf b/src/health/health.d/pihole.conf index c4db835ce..c4db835ce 100644 --- a/health/health.d/pihole.conf +++ b/src/health/health.d/pihole.conf diff --git a/src/health/health.d/ping.conf b/src/health/health.d/ping.conf new file mode 100644 index 000000000..a91b231c3 --- /dev/null +++ b/src/health/health.d/ping.conf @@ -0,0 +1,50 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: ping_host_reachable + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -30s unaligned of loss + calc: ($this == nan) ? 
(nan) : ($this < 100) + units: up/down + every: 10s + crit: $this == 0 + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping status + info: Network host ${label:host} reachability status + to: sysadmin + + template: ping_packet_loss + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -10m unaligned of loss + green: 5 + red: 10 + units: % + every: 10s + warn: $this > $green + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping packet loss + info: Packet loss percentage to the network host ${label:host} over the last 10 minutes + to: sysadmin + + template: ping_host_latency + on: ping.host_rtt + class: Latency + type: Other +component: Network + lookup: average -10s unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + warn: $this > $green OR $max > $red + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping latency + info: Average latency to the network host ${label:host} over the last 10 seconds + to: sysadmin diff --git a/health/health.d/plugin.conf b/src/health/health.d/plugin.conf index 8615a0213..8615a0213 100644 --- a/health/health.d/plugin.conf +++ b/src/health/health.d/plugin.conf diff --git a/health/health.d/portcheck.conf b/src/health/health.d/portcheck.conf index 281731c86..281731c86 100644 --- a/health/health.d/portcheck.conf +++ b/src/health/health.d/portcheck.conf diff --git a/src/health/health.d/postgres.conf b/src/health/health.d/postgres.conf new file mode 100644 index 000000000..17e418758 --- /dev/null +++ b/src/health/health.d/postgres.conf @@ -0,0 +1,216 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: postgres_total_connection_utilization + on: postgres.connections_utilization + class: Utilization + type: Database +component: PostgreSQL + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? 
(70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL connection utilization + info: Average total connection utilization over the last minute + to: dba + + template: postgres_acquired_locks_utilization + on: postgres.locks_utilization + class: Utilization + type: Database +component: PostgreSQL + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (15) : (20)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL acquired locks utilization + info: Average acquired locks utilization over the last minute + to: dba + + template: postgres_txid_exhaustion_perc + on: postgres.txid_exhaustion_perc + class: Utilization + type: Database +component: PostgreSQL + calc: $txid_exhaustion + units: % + every: 1m + warn: $this > 90 + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL TXID exhaustion + info: Percent towards TXID wraparound + to: dba + +# Database alarms + + template: postgres_db_cache_io_ratio + on: postgres.db_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} cache hit ratio + info: Average cache hit ratio in db ${label:database} over the last minute + to: dba + + template: postgres_db_transactions_rollback_ratio + on: postgres.db_transactions_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -5m unaligned of rollback + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (2)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} aborted transactions + info: Average aborted transactions percentage in db ${label:database} over the last five minutes + to: dba + + template: postgres_db_deadlocks_rate + on: postgres.db_deadlocks_rate + class: Errors + type: Database +component: PostgreSQL + lookup: sum -1m unaligned of deadlocks + units: deadlocks + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} deadlocks rate + info: Number of deadlocks detected in db ${label:database} in the last minute + to: dba + +# Table alarms + + template: postgres_table_cache_io_ratio + on: postgres.table_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio + info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_index_cache_io_ratio + on: postgres.table_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio + info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_toast_cache_io_ratio + on: postgres.table_toast_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio + info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_toast_index_cache_io_ratio + on: postgres.table_toast_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio + info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_bloat_size_perc + on: postgres.table_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} bloat size + info: Bloat size percentage in db ${label:database} table ${label:table} + to: dba + + template: postgres_table_last_autovacuum_time + on: postgres.table_autovacuum_since_time + class: Errors + type: Database +component: PostgreSQL +host labels: _hostname=!* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum + info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon + to: dba + + template: postgres_table_last_autoanalyze_time + on: postgres.table_autoanalyze_since_time + class: Errors + type: Database +component: PostgreSQL +host labels: _hostname=!* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze + info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon + to: dba + +# Index alarms + + template: postgres_index_bloat_size_perc + on: postgres.index_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index bloat size + info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index} + to: dba diff --git a/health/health.d/processes.conf b/src/health/health.d/processes.conf index 8f2e0fda5..2029c76e4 100644 --- a/health/health.d/processes.conf +++ b/src/health/health.d/processes.conf @@ -5,7 +5,6 @@ class: Workload type: System component: Processes - hosts: * calc: $active * 100 / $pidmax units: % every: 5s diff --git a/src/health/health.d/python.d.plugin.conf b/src/health/health.d/python.d.plugin.conf new file mode 100644 index 000000000..f962b07f2 --- /dev/null +++ b/src/health/health.d/python.d.plugin.conf @@ -0,0 +1,17 @@ +# make sure python.d.plugin data collection job is running + + template: python.d_job_last_collected_secs + on: netdata.pythond_runtime + class: Errors + type: Netdata + component: python.d.plugin +host labels: _hostname=!* + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Python.d plugin last collection + info: Number of seconds since the last successful data collection + to: webmaster diff --git a/src/health/health.d/qos.conf b/src/health/health.d/qos.conf new file mode 100644 index 000000000..f524a1578 --- /dev/null +++ b/src/health/health.d/qos.conf @@ -0,0 +1,16 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# check if a QoS class is dropping packets +# the alarm is checked every 30 seconds +# and examines the last 5 minutes of data + + template: 10min_qos_packet_drops + on: tc.qos_dropped +host labels: _os=linux + lookup: sum -5m unaligned absolute + every: 30s + warn: $this > 0 + units: packets + summary: QOS packet drops + info: Dropped packets in the last 5 minutes + to: silent diff --git a/src/health/health.d/ram.conf b/src/health/health.d/ram.conf new file mode 100644 index 000000000..573bc0aca --- /dev/null +++ b/src/health/health.d/ram.conf @@ -0,0 +1,76 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: ram_in_use + on: system.ram + class: Utilization + type: System + component: Memory +host labels: _os=linux + calc: $used * 100 / ($used + $cached + $free + $buffers) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory utilization + info: System memory utilization + to: sysadmin + + alarm: ram_available + on: mem.available + class: Utilization + type: System + component: Memory +host labels: _os=linux + calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? 
(15) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: System available memory + info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping + to: silent + + alarm: oom_kill + on: mem.oom_kill +host labels: _os=linux + lookup: sum -30m unaligned + units: kills + every: 5m + warn: $this > 0 + delay: down 10m + summary: System OOM kills + info: Number of out of memory kills in the last 30 minutes + to: silent + +## FreeBSD + alarm: ram_in_use + on: system.ram + class: Utilization + type: System + component: Memory +host labels: _os=freebsd + calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory utilization + info: System memory utilization + to: sysadmin + + alarm: ram_available + on: mem.available + class: Utilization + type: System + component: Memory +host labels: _os=freebsd + calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? 
(15) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: System available memory + info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping + to: silent diff --git a/src/health/health.d/redis.conf b/src/health/health.d/redis.conf new file mode 100644 index 000000000..4f82830a9 --- /dev/null +++ b/src/health/health.d/redis.conf @@ -0,0 +1,58 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: redis_connections_rejected + on: redis.connections + class: Errors + type: KV Storage +component: Redis + lookup: sum -1m unaligned of rejected + every: 10s + units: connections + warn: $this > 0 + summary: Redis rejected connections + info: Connections rejected because of maxclients limit in the last minute + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_bgsave_broken + on: redis.bgsave_health + class: Errors + type: KV Storage +component: Redis + every: 10s + calc: $last_bgsave != nan AND $last_bgsave != 0 + crit: $this + units: ok/failed + summary: Redis background save + info: Status of the last RDB save operation (0: ok, 1: error) + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_bgsave_slow + on: redis.bgsave_now + class: Latency + type: KV Storage +component: Redis + every: 10s + calc: $current_bgsave_time + warn: $this > 600 + crit: $this > 1200 + units: seconds + summary: Redis slow background save + info: Duration of the on-going RDB save operation + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_master_link_down + on: redis.master_link_down_since_time + class: Errors + type: KV Storage +component: Redis + every: 10s + calc: $time + units: seconds + crit: $this != nan AND $this > 0 + summary: Redis master link down + info: Time elapsed since the link between master and slave is down + delay: down 5m multiplier 1.5 max 1h + to: dba diff --git a/health/health.d/retroshare.conf b/src/health/health.d/retroshare.conf 
index c665430fa..c665430fa 100644 --- a/health/health.d/retroshare.conf +++ b/src/health/health.d/retroshare.conf diff --git a/health/health.d/riakkv.conf b/src/health/health.d/riakkv.conf index 677e3cb4f..677e3cb4f 100644 --- a/health/health.d/riakkv.conf +++ b/src/health/health.d/riakkv.conf diff --git a/health/health.d/scaleio.conf b/src/health/health.d/scaleio.conf index b089cb85e..b089cb85e 100644 --- a/health/health.d/scaleio.conf +++ b/src/health/health.d/scaleio.conf diff --git a/src/health/health.d/softnet.conf b/src/health/health.d/softnet.conf new file mode 100644 index 000000000..03a4ceebd --- /dev/null +++ b/src/health/health.d/softnet.conf @@ -0,0 +1,53 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# check for common /proc/net/softnet_stat errors + + alarm: 1min_netdev_backlog_exceeded + on: system.softnet_stat + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of dropped + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netdev dropped packets + info: Average number of dropped packets in the last minute \ + due to exceeded net.core.netdev_max_backlog + to: silent + + alarm: 1min_netdev_budget_ran_outs + on: system.softnet_stat + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of squeezed + units: events + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netdev budget run outs + info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \ + net.core.netdev_budget_usecs with work remaining over the last minute \ + (this can be a cause for dropped packets) + to: silent + + alarm: 10min_netisr_backlog_exceeded + on: system.softnet_stat + class: Errors + type: System + component: Network +host labels: _os=freebsd + lookup: average -1m unaligned absolute of qdrops + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netisr drops + info: Average number of drops in the last minute \ + due to exceeded sysctl net.route.netisr_maxqlen \ + (this can be a cause for dropped packets) + to: silent diff --git a/src/health/health.d/storcli.conf b/src/health/health.d/storcli.conf new file mode 100644 index 000000000..be71b517e --- /dev/null +++ b/src/health/health.d/storcli.conf @@ -0,0 +1,61 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# Controllers + + template: storcli_controller_health_status + on: storcli.controller_health_status + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of healthy + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: RAID controller ${label:controller_number} health + info: RAID controller ${label:controller_number} is unhealthy + to: sysadmin + + template: storcli_controller_bbu_status + on: storcli.controller_bbu_status + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of healthy,na + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: RAID controller ${label:controller_number} BBU health + info: RAID controller ${label:controller_number} BBU is unhealthy + to: sysadmin + +# Physical Drives + + template: storcli_phys_drive_errors 
+ on: storcli.phys_drive_errors + class: Errors + type: System +component: RAID + lookup: sum -10s + units: errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors + to: sysadmin + + template: storcli_phys_drive_predictive_failures + on: storcli.phys_drive_predictive_failures + class: Errors + type: System +component: RAID + lookup: sum -10s + units: failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures + to: sysadmin diff --git a/src/health/health.d/swap.conf b/src/health/health.d/swap.conf new file mode 100644 index 000000000..297aebd1e --- /dev/null +++ b/src/health/health.d/swap.conf @@ -0,0 +1,34 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 30min_ram_swapped_out + on: mem.swapio + class: Workload + type: System + component: Memory +host labels: _os=linux freebsd + lookup: sum -30m unaligned absolute of out + # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 + calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + units: % of RAM + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory swapped out + info: Percentage of the system RAM swapped in the last 30 minutes + to: silent + + alarm: used_swap + on: mem.swap + class: Utilization + type: System + component: Memory +host labels: _os=linux freebsd + calc: (($used + $free) > 0) ? 
($used * 100 / ($used + $free)) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 30s down 15m multiplier 1.5 max 1h + summary: System swap memory utilization + info: Swap memory utilization + to: sysadmin diff --git a/health/health.d/synchronization.conf b/src/health/health.d/synchronization.conf index 6c947d90b..28b1817ac 100644 --- a/health/health.d/synchronization.conf +++ b/src/health/health.d/synchronization.conf @@ -2,7 +2,6 @@ on: mem.sync lookup: sum -1m of sync units: calls - plugin: ebpf.plugin every: 1m warn: $this > 6 delay: up 1m down 10m multiplier 1.5 max 1h diff --git a/src/health/health.d/systemdunits.conf b/src/health/health.d/systemdunits.conf new file mode 100644 index 000000000..bb5c627e8 --- /dev/null +++ b/src/health/health.d/systemdunits.conf @@ -0,0 +1,177 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +## Service units + template: systemd_service_unit_failed_state + on: systemd.service_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd service unit in the failed state + to: sysadmin + +## Socket units + template: systemd_socket_unit_failed_state + on: systemd.socket_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd socket unit in the failed state + to: sysadmin + +## Target units + template: systemd_target_unit_failed_state + on: systemd.target_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + 
units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd target unit in the failed state + to: sysadmin + +## Path units + template: systemd_path_unit_failed_state + on: systemd.path_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd path unit in the failed state + to: sysadmin + +## Device units + template: systemd_device_unit_failed_state + on: systemd.device_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd device unit in the failed state + to: sysadmin + +## Mount units + template: systemd_mount_unit_failed_state + on: systemd.mount_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd mount units in the failed state + to: sysadmin + +## Automount units + template: systemd_automount_unit_failed_state + on: systemd.automount_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd automount unit in the failed state + to: sysadmin + +## Swap units + template: systemd_swap_unit_failed_state + on: systemd.swap_unit_state + class: Errors + type: Linux + component: 
Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd swap units in the failed state + to: sysadmin + +## Scope units + template: systemd_scope_unit_failed_state + on: systemd.scope_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd scope units in the failed state + to: sysadmin + +## Slice units + template: systemd_slice_unit_failed_state + on: systemd.slice_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd slice units in the failed state + to: sysadmin + +## Timer units + template: systemd_timer_unit_failed_state + on: systemd.timer_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd timer unit in the failed state + to: sysadmin diff --git a/src/health/health.d/tcp_conn.conf b/src/health/health.d/tcp_conn.conf new file mode 100644 index 000000000..fe4b98db0 --- /dev/null +++ b/src/health/health.d/tcp_conn.conf @@ -0,0 +1,21 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ${tcp_max_connections} may be nan or -1 if the system +# supports dynamic threshold for TCP connections. +# In this case, the alarm will always be zero. 
+ + alarm: tcp_connections + on: ip.tcpsock + class: Workload + type: System + component: Network +host labels: _os=linux + calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) + crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP connections utilization + info: IPv4 TCP connections utilization + to: sysadmin diff --git a/src/health/health.d/tcp_listen.conf b/src/health/health.d/tcp_listen.conf new file mode 100644 index 000000000..bdcce79d4 --- /dev/null +++ b/src/health/health.d/tcp_listen.conf @@ -0,0 +1,93 @@ +# There are two queues involved when incoming TCP connections are handled +# (both at the kernel): +# +# SYN queue +# The SYN queue tracks TCP handshakes until connections are fully established. +# It overflows when too many incoming TCP connection requests hang in the +# half-open state and the server is not configured to fall back to SYN cookies. +# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends +# lots of SYN packets and never completes the handshakes). +# +# Accept queue +# The accept queue holds fully established TCP connections waiting to be handled +# by the listening application. It overflows when the server application fails +# to accept new connections at the rate they are coming in. +# +# +# ----------------------------------------------------------------------------- +# tcp accept queue (at the kernel) + + alarm: 1m_tcp_accept_queue_overflows + on: ip.tcp_accept_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of ListenOverflows + units: overflows + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? 
(1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP accept queue overflows + info: Average number of overflows in the TCP accept queue over the last minute + to: silent + +# THIS IS TOO GENERIC +# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 + alarm: 1m_tcp_accept_queue_drops + on: ip.tcp_accept_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of ListenDrops + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP accept queue dropped packets + info: Average number of dropped packets in the TCP accept queue over the last minute + to: silent + +# ----------------------------------------------------------------------------- +# tcp SYN queue (at the kernel) + +# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or +# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are +# enabled or not. In both cases this probably indicates a SYN flood attack, +# so i guess a notification should be sent. + + alarm: 1m_tcp_syn_queue_drops + on: ip.tcp_syn_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of TCPReqQFullDrop + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + summary: System TCP SYN queue drops + info: Average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \ + (SYN cookies were not enabled) + to: silent + + alarm: 1m_tcp_syn_queue_cookies + on: ip.tcp_syn_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of TCPReqQFullDoCookies + units: cookies + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? 
(0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + summary: System TCP SYN queue cookies + info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute + to: silent diff --git a/src/health/health.d/tcp_mem.conf b/src/health/health.d/tcp_mem.conf new file mode 100644 index 000000000..b9350e3cd --- /dev/null +++ b/src/health/health.d/tcp_mem.conf @@ -0,0 +1,22 @@ +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# We give a warning when TCP is under memory pressure +# and a critical when TCP is 90% of its upper memory limit +# + + alarm: tcp_memory + on: ipv4.sockstat_tcp_mem + class: Utilization + type: System + component: Network +host labels: _os=linux + calc: ${mem} * 100 / ${tcp_mem_high} + units: % + every: 10s + warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) + crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP memory utilization + info: TCP memory utilization + to: silent diff --git a/src/health/health.d/tcp_orphans.conf b/src/health/health.d/tcp_orphans.conf new file mode 100644 index 000000000..7b2d95edb --- /dev/null +++ b/src/health/health.d/tcp_orphans.conf @@ -0,0 +1,22 @@ +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# The kernel may penalize orphans by 2x or even 4x +# so we alarm warning at 25% and critical at 50% +# + + alarm: tcp_orphans + on: ipv4.sockstat_tcp_sockets + class: Errors + type: System + component: Network +host labels: _os=linux + calc: ${orphan} * 100 / ${tcp_max_orphans} + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) + crit: $this > (($status == $CRITICAL) ? 
( 25 ) : ( 50 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP orphan sockets utilization + info: Orphan IPv4 TCP sockets utilization + to: silent diff --git a/src/health/health.d/tcp_resets.conf b/src/health/health.d/tcp_resets.conf new file mode 100644 index 000000000..63f798d78 --- /dev/null +++ b/src/health/health.d/tcp_resets.conf @@ -0,0 +1,66 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# tcp resets this host sends + + alarm: 1m_ip_tcp_resets_sent + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m at -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + info: average number of sent TCP RESETS over the last minute + + alarm: 10s_ip_tcp_resets_sent + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + summary: System TCP outbound resets + info: Average number of sent TCP RESETS over the last 10 seconds. \ + This can indicate a port scan, \ + or that a service running on this host has crashed. \ + Netdata will not send a clear notification for this alarm. 
+ to: silent + +# ----------------------------------------------------------------------------- +# tcp resets this host receives + + alarm: 1m_ip_tcp_resets_received + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -1m at -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + info: average number of received TCP RESETS over the last minute + + alarm: 10s_ip_tcp_resets_received + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + summary: System TCP inbound resets + info: average number of received TCP RESETS over the last 10 seconds. \ + This can be an indication that a service this host needs has crashed. \ + Netdata will not send a clear notification for this alarm. + to: silent diff --git a/src/health/health.d/timex.conf b/src/health/health.d/timex.conf new file mode 100644 index 000000000..053dc9290 --- /dev/null +++ b/src/health/health.d/timex.conf @@ -0,0 +1,17 @@ +# It can take several minutes before ntpd selects a server to synchronize with; +# try checking after 17 minutes (1024 seconds). 
+ + alarm: system_clock_sync_state + on: system.clock_sync_state + class: Errors + type: System + component: Clock +host labels: _os=linux + calc: $state + units: synchronization state + every: 10s + warn: $system.uptime.uptime > 17 * 60 AND $this == 0 + delay: down 5m + summary: System clock sync state + info: When set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server + to: silent diff --git a/src/health/health.d/udp_errors.conf b/src/health/health.d/udp_errors.conf new file mode 100644 index 000000000..745c11e21 --- /dev/null +++ b/src/health/health.d/udp_errors.conf @@ -0,0 +1,37 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# UDP receive buffer errors + + alarm: 1m_ipv4_udp_receive_buffer_errors + on: ipv4.udperrors + class: Errors + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -1m unaligned absolute of RcvbufErrors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + summary: System UDP receive buffer errors + info: Average number of UDP receive buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h + to: silent + +# ----------------------------------------------------------------------------- +# UDP send buffer errors + + alarm: 1m_ipv4_udp_send_buffer_errors + on: ipv4.udperrors + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of SndbufErrors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : (10)) + summary: System UDP send buffer errors + info: Average number of UDP send buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h + to: silent diff --git a/health/health.d/unbound.conf b/src/health/health.d/unbound.conf index 3c898f1d5..3c898f1d5 100644 --- a/health/health.d/unbound.conf +++ b/src/health/health.d/unbound.conf diff --git a/src/health/health.d/upsd.conf b/src/health/health.d/upsd.conf new file mode 100644 index 000000000..17eb5263d --- /dev/null +++ b/src/health/health.d/upsd.conf @@ -0,0 +1,46 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: upsd_10min_ups_load + on: upsd.ups_load + class: Utilization + type: Power Supply +component: UPS + lookup: average -10m unaligned of load + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 10m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} load + info: UPS ${label:ups_name} average load over the last 10 minutes + to: sitemgr + + template: upsd_ups_battery_charge + on: upsd.ups_battery_charge + class: Errors + type: Power Supply +component: UPS + lookup: average -60s unaligned of charge + units: % + every: 60s + warn: $this < 75 + crit: $this < 40 + delay: down 10m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} battery charge + info: UPS ${label:ups_name} average battery charge over the last minute + to: sitemgr + + template: upsd_ups_last_collected_secs + on: upsd.ups_load + class: Latency + type: Power Supply +component: UPS device + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} last collected + info: UPS ${label:ups_name} number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/vcsa.conf b/src/health/health.d/vcsa.conf index 3e20bfd1e..3e20bfd1e 100644 --- a/health/health.d/vcsa.conf +++ b/src/health/health.d/vcsa.conf diff --git a/health/health.d/vernemq.conf b/src/health/health.d/vernemq.conf index 6ea9f99dc..6ea9f99dc 100644 --- a/health/health.d/vernemq.conf +++ b/src/health/health.d/vernemq.conf diff --git a/src/health/health.d/vsphere.conf b/src/health/health.d/vsphere.conf new file mode 100644 index 000000000..e22f0b620 --- /dev/null +++ b/src/health/health.d/vsphere.conf @@ -0,0 +1,66 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# -----------------------------------------------Virtual Machine-------------------------------------------------------- + + template: vsphere_vm_cpu_utilization + on: vsphere.vm_cpu_utilization + class: Utilization + type: Virtual Machine +component: CPU + lookup: average -10m unaligned match-names of used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere CPU utilization for VM ${label:vm} + info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: silent + + template: vsphere_vm_mem_utilization + on: vsphere.vm_mem_utilization + class: Utilization + type: Virtual Machine +component: Memory + calc: $used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere memory utilization for VM ${label:vm} + info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: silent + +# -----------------------------------------------ESXI host-------------------------------------------------------------- + + template: vsphere_host_cpu_utilization + on: vsphere.host_cpu_utilization + class: Utilization + type: Virtual Machine +component: CPU + lookup: average -10m unaligned match-names of used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere ESXi CPU utilization for host ${label:host} + info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: sysadmin + + template: vsphere_host_mem_utilization + on: vsphere.host_mem_utilization + class: Utilization + type: Virtual Machine +component: Memory + calc: $used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere ESXi Ram utilization for host ${label:host} + info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: sysadmin diff --git a/health/health.d/web_log.conf b/src/health/health.d/web_log.conf index 78f1cc7f5..78f1cc7f5 100644 --- a/health/health.d/web_log.conf +++ b/src/health/health.d/web_log.conf diff --git a/src/health/health.d/whoisquery.conf b/src/health/health.d/whoisquery.conf new file mode 100644 index 000000000..6d87ad280 --- /dev/null +++ b/src/health/health.d/whoisquery.conf @@ -0,0 +1,14 @@ + + template: whoisquery_days_until_expiration + on: whoisquery.time_until_expiration + class: Utilization + type: Other +component: WHOIS + calc: $expiry / 86400 + units: days + every: 60s + warn: $this < $days_until_expiration_warning + crit: $this < $days_until_expiration_critical + summary: Whois expiration time for domain ${label:domain} + info: Time until the domain name registration for ${label:domain} expires + to: webmaster diff --git a/src/health/health.d/windows.conf b/src/health/health.d/windows.conf new file mode 100644 index 000000000..9dfda50c1 --- /dev/null +++ b/src/health/health.d/windows.conf @@ -0,0 +1,108 @@ +## CPU + + template: windows_10min_cpu_usage + on: windows.cpu_utilization_total + class: Utilization + type: Windows +component: CPU + lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: CPU utilization + info: Average CPU utilization over the last 10 minutes + to: silent + +## Memory + + template: windows_ram_in_use + on: windows.memory_utilization + class: Utilization + type: Windows +component: Memory + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Ram utilization + info: Memory utilization + to: sysadmin + +## Network + + template: windows_inbound_packets_discarded + on: windows.net_nic_discarded + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Inbound network packets discarded + info: Number of inbound discarded packets for the network interface in the last 10 minutes + to: silent + + template: windows_outbound_packets_discarded + on: windows.net_nic_discarded + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Outbound network packets discarded + info: Number of outbound discarded packets for the network interface in the last 10 minutes + to: silent + + template: windows_inbound_packets_errors + on: windows.net_nic_errors + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Inbound network errors + info: Number of inbound errors for the network interface in the last 10 minutes + to: silent + + template: windows_outbound_packets_errors + on: windows.net_nic_errors + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Outbound network errors + info: Number of outbound errors for the network interface in the last 10 minutes + to: silent + +## Disk + + template: windows_disk_in_use + on: windows.logical_disk_space_usage + class: Utilization + type: Windows +component: 
Disk + calc: ($used) * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Disk space usage + info: Disk space utilization + to: sysadmin diff --git a/src/health/health.d/x509check.conf b/src/health/health.d/x509check.conf new file mode 100644 index 000000000..1d40c8602 --- /dev/null +++ b/src/health/health.d/x509check.conf @@ -0,0 +1,26 @@ + + template: x509check_days_until_expiration + on: x509check.time_until_expiration + class: Latency + type: Certificates +component: x509 certificates + calc: $expiry / 86400 + units: days + every: 60s + warn: $this < $days_until_expiration_warning + crit: $this < $days_until_expiration_critical + summary: x509 certificate expiration for ${label:source} + info: Time until x509 certificate expires for ${label:source} + to: webmaster + + template: x509check_revocation_status + on: x509check.revocation_status + class: Errors + type: Certificates +component: x509 certificates + calc: $revoked + every: 60s + crit: $this != nan AND $this != 0 + summary: x509 certificate revocation status for ${label:source} + info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source} + to: webmaster diff --git a/src/health/health.d/zfs.conf b/src/health/health.d/zfs.conf new file mode 100644 index 000000000..9c1f0018b --- /dev/null +++ b/src/health/health.d/zfs.conf @@ -0,0 +1,90 @@ + + alarm: zfs_memory_throttle + on: zfs.memory_ops + class: Utilization + type: System +component: File system + lookup: sum -10m unaligned absolute of throttled + units: events + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + summary: ZFS ARC growth throttling + info: number of times ZFS had to limit the ARC growth in the last 10 minutes + to: silent + +# ZFS pool state + + template: zfs_pool_state_warn + on: zfspool.state + class: Errors + type: System +component: File 
system + calc: $degraded + units: boolean + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is degraded + to: sysadmin + + template: zfs_pool_state_crit + on: zfspool.state + class: Errors + type: System +component: File system + calc: $faulted + $unavail + units: boolean + every: 10s + crit: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: Critical ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is faulted or unavail + to: sysadmin + + +## go.d/zfspool + + template: zfs_pool_space_utilization + on: zfspool.pool_space_utilization + class: Utilization + type: System +component: File system + calc: $utilization + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 1m multiplier 1.5 max 1h + summary: ZFS pool ${label:pool} space utilization + info: ZFS pool ${label:pool} is nearing capacity. Current space usage is above the threshold. 
+ to: sysadmin + + template: zfs_pool_health_state_warn + on: zfspool.pool_health_state + class: Errors + type: System +component: File system + calc: $degraded + units: boolean + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is degraded + to: sysadmin + + template: zfs_pool_health_state_crit + on: zfspool.pool_health_state + class: Errors + type: System +component: File system + calc: $faulted + $unavail + units: boolean + every: 10s + crit: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: Critical ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is faulted or unavail + to: sysadmin diff --git a/src/health/health.h b/src/health/health.h new file mode 100644 index 000000000..b1ac5a9e1 --- /dev/null +++ b/src/health/health.h @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_HEALTH_H +#define NETDATA_HEALTH_H 1 + +#include "daemon/common.h" +#include "rrdcalc.h" + +typedef enum __attribute__((packed)) { + HEALTH_ENTRY_FLAG_PROCESSED = 0x00000001, // notifications engine has processed this + HEALTH_ENTRY_FLAG_UPDATED = 0x00000002, // there is a more recent update about this transition + HEALTH_ENTRY_FLAG_EXEC_RUN = 0x00000004, // notification script has been run (this is the intent, not the result) + HEALTH_ENTRY_FLAG_EXEC_FAILED = 0x00000008, // notification script couldn't be run + HEALTH_ENTRY_FLAG_SILENCED = 0x00000010, + HEALTH_ENTRY_RUN_ONCE = 0x00000020, + HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS = 0x00000040, + HEALTH_ENTRY_FLAG_IS_REPEATING = 0x00000080, + HEALTH_ENTRY_FLAG_SAVED = 0x10000000, // Saved to SQL + HEALTH_ENTRY_FLAG_ACLK_QUEUED = 0x20000000, // Sent to Netdata Cloud + HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION = 0x80000000, +} HEALTH_ENTRY_FLAGS; + +#define RRDR_OPTIONS_DATA_SOURCES (RRDR_OPTION_PERCENTAGE|RRDR_OPTION_ANOMALY_BIT) +#define RRDR_OPTIONS_DIMS_AGGREGATION 
(RRDR_OPTION_DIMS_MIN|RRDR_OPTION_DIMS_MAX|RRDR_OPTION_DIMS_AVERAGE|RRDR_OPTION_DIMS_MIN2MAX) +#define RRDR_OPTIONS_REMOVE_OVERLAPPING(options) ((options) & ~(RRDR_OPTIONS_DIMS_AGGREGATION|RRDR_OPTIONS_DATA_SOURCES)) + +void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags); + +#ifndef HEALTH_LISTEN_PORT +#define HEALTH_LISTEN_PORT 19998 +#endif + +#ifndef HEALTH_LISTEN_BACKLOG +#define HEALTH_LISTEN_BACKLOG 4096 +#endif + +#ifndef HEALTH_LOG_DEFAULT_HISTORY +#define HEALTH_LOG_DEFAULT_HISTORY 432000 +#endif + +#ifndef HEALTH_LOG_MINIMUM_HISTORY +#define HEALTH_LOG_MINIMUM_HISTORY 86400 +#endif + +#define HEALTH_SILENCERS_MAX_FILE_LEN 10000 + +void health_plugin_init(void); +void health_plugin_destroy(void); + +void health_plugin_reload(void); + +void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); +void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); +void health_alert2json_conf(RRDHOST *host, BUFFER *wb, CONTEXTS_V2_OPTIONS all); +void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); + +void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *wb); +void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); + +int health_alarm_log_open(RRDHOST *host); +void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); +void health_alarm_log_load(RRDHOST *host); + +ALARM_ENTRY* health_create_alarm_entry( + RRDHOST *host, + RRDCALC *rc, + time_t when, + time_t duration, + NETDATA_DOUBLE old_value, + NETDATA_DOUBLE new_value, + RRDCALC_STATUS old_status, + RRDCALC_STATUS new_status, + int delay, + HEALTH_ENTRY_FLAGS flags); + +void health_alarm_log_add_entry(RRDHOST *host, ALARM_ENTRY *ae); + +char *health_user_config_dir(void); +char *health_stock_config_dir(void); +void health_alarm_log_free(RRDHOST *host); + +void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); + +void *health_cmdapi_thread(void *ptr); + +char 
*health_edit_command_from_source(const char *source); + +void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix); + +void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function); +#define health_log_alert(host, ae) health_log_alert_transition_with_trace(host, ae, __LINE__, __FILE__, __FUNCTION__) +bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, nd_uuid_t *transitions_id); + +int alert_variable_lookup_trace(RRDHOST *host, RRDSET *st, const char *variable, BUFFER *wb); + +#include "health_prototypes.h" +#include "health_silencers.h" + +typedef void (*prototype_metadata_cb_t)(void *data, STRING *type, STRING *component, STRING *classification, STRING *recipient); +void health_prototype_metadata_foreach(void *data, prototype_metadata_cb_t cb); + +#endif //NETDATA_HEALTH_H diff --git a/src/health/health_config.c b/src/health/health_config.c new file mode 100644 index 000000000..c17f7e21d --- /dev/null +++ b/src/health/health_config.c @@ -0,0 +1,842 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health.h" +#include "health_internals.h" + +static inline int health_parse_delay( + size_t line, const char *filename, char *string, + int *delay_up_duration, + int *delay_down_duration, + int *delay_max_duration, + float *delay_multiplier) { + + char given_up = 0; + char given_down = 0; + char given_max = 0; + char given_multiplier = 0; + + char *s = string; + while(*s) { + char *key = s; + + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + + if(!*key) break; + + char *value = s; + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + + if(!strcasecmp(key, "up")) { + if (!config_parse_duration(value, delay_up_duration)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' 
keyword", + line, filename, value, key); + } + else given_up = 1; + } + else if(!strcasecmp(key, "down")) { + if (!config_parse_duration(value, delay_down_duration)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, filename, value, key); + } + else given_down = 1; + } + else if(!strcasecmp(key, "multiplier")) { + *delay_multiplier = strtof(value, NULL); + if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, filename, value, key); + } + else given_multiplier = 1; + } + else if(!strcasecmp(key, "max")) { + if (!config_parse_duration(value, delay_max_duration)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, filename, value, key); + } + else given_max = 1; + } + else { + netdata_log_error("Health configuration at line %zu of file '%s': unknown keyword '%s'", + line, filename, key); + } + } + + if(!given_up) + *delay_up_duration = 0; + + if(!given_down) + *delay_down_duration = 0; + + if(!given_multiplier) + *delay_multiplier = 1.0; + + if(!given_max) { + if((*delay_max_duration) < (*delay_up_duration) * (*delay_multiplier)) + *delay_max_duration = (int)((*delay_up_duration) * (*delay_multiplier)); + + if((*delay_max_duration) < (*delay_down_duration) * (*delay_multiplier)) + *delay_max_duration = (int)((*delay_down_duration) * (*delay_multiplier)); + } + + return 1; +} + +static inline ALERT_ACTION_OPTIONS health_parse_options(const char *s) { + ALERT_ACTION_OPTIONS options = ALERT_ACTION_OPTION_NONE; + char buf[100+1] = ""; + + while(*s) { + buf[0] = '\0'; + + // skip spaces + while(*s && isspace((uint8_t)*s)) + s++; + + // find the next space + size_t count = 0; + while(*s && count < 100 && !isspace((uint8_t)*s)) + buf[count++] = *s++; + + if(buf[0]) { + buf[count] = '\0'; + + 
if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear")) + options |= ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION; + else + netdata_log_error("Ignoring unknown alarm option '%s'", buf); + } + } + + return options; +} + +static inline int health_parse_repeat( + size_t line, + const char *file, + char *string, + uint32_t *warn_repeat_every, + uint32_t *crit_repeat_every +) { + + char *s = string; + while(*s) { + char *key = s; + + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + + if(!*key) break; + + char *value = s; + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + + if(!strcasecmp(key, "off")) { + *warn_repeat_every = 0; + *crit_repeat_every = 0; + return 1; + } + if(!strcasecmp(key, "warning")) { + if (!config_parse_duration(value, (int*)warn_repeat_every)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + else if(!strcasecmp(key, "critical")) { + if (!config_parse_duration(value, (int*)crit_repeat_every)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + } + + return 1; +} + +static inline int isvariableterm(const char s) { + if(isalnum(s) || s == '.' 
|| s == '_') + return 0; + + return 1; +} + +static inline int health_parse_db_lookup(size_t line, const char *filename, char *string, struct rrd_alert_config *ac) { + if(ac->dimensions) string_freez(ac->dimensions); + ac->dimensions = NULL; + ac->after = 0; + ac->before = 0; + ac->update_every = 0; + ac->options = 0; + ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_EQUAL; + ac->time_group_value = NAN; + + char *s = string, *key; + + // first is the group method + key = s; + while(*s && !isspace((uint8_t)*s) && *s != '(') s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + if(!*s) { + netdata_log_error("Health configuration invalid chart calculation at line %zu of file '%s': expected group method followed by the 'after' time, but got '%s'", + line, filename, key); + return 0; + } + + bool group_options = false; + if(*s == '(') { + *s++ = '\0'; + group_options = true; + } + + if((ac->time_group = time_grouping_parse(key, RRDR_GROUPING_UNDEFINED)) == RRDR_GROUPING_UNDEFINED) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid group method '%s'", + line, filename, key); + return 0; + } + + if(group_options) { + if(*s == '!') { + s++; + if(*s == '=') s++; + ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_NOT_EQUAL; + } + else if(*s == '<') { + s++; + if(*s == '>') { + s++; + ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_NOT_EQUAL; + } + else if(*s == '=') { + s++; + ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS_EQUAL; + } + else + ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS; + } + else if(*s == '>') { + s++; /* consume '>' before testing for '=' (same as the '<' branch) */ + if(*s == '=') { + s++; + ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER_EQUAL; + } + else + ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER; + } + + while(*s && isspace((uint8_t)*s)) s++; + + if(*s) { + if(isdigit((uint8_t)*s) || *s == '.') { + ac->time_group_value = str2ndd(s, &s); + while(s && *s && 
isspace((uint8_t)*s)) s++; + + if(!s || *s != ')') { + netdata_log_error("Health configuration at line %zu of file '%s': missing closing parenthesis after number in aggregation method on '%s'", + line, filename, key); + return 0; + } + } + } + else if(*s != ')') { + netdata_log_error("Health configuration at line %zu of file '%s': missing closing parenthesis after method on '%s'", + line, filename, key); + return 0; + } + + s++; + } + + switch (ac->time_group) { + default: + break; + + case RRDR_GROUPING_COUNTIF: + if(isnan(ac->time_group_value)) + ac->time_group_value = 0; + break; + + case RRDR_GROUPING_TRIMMED_MEAN: + case RRDR_GROUPING_TRIMMED_MEDIAN: + if(isnan(ac->time_group_value)) + ac->time_group_value = 5; + break; + + case RRDR_GROUPING_PERCENTILE: + if(isnan(ac->time_group_value)) + ac->time_group_value = 95; + break; + } + + // then is the 'after' time + key = s; + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + + if(!config_parse_duration(key, &ac->after)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method", + line, filename, key); + return 0; + } + + // sane defaults + ac->update_every = ABS(ac->after); + + // now we may have optional parameters + while(*s) { + key = s; + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + if(!*key) break; + + if(!strcasecmp(key, "at")) { + char *value = s; + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + + if (!config_parse_duration(value, &ac->before)) { + netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", + line, filename, value, key); + } + } + else if(!strcasecmp(key, HEALTH_EVERY_KEY)) { + char *value = s; + while(*s && !isspace((uint8_t)*s)) s++; + while(*s && isspace((uint8_t)*s)) *s++ = '\0'; + + if (!config_parse_duration(value, &ac->update_every)) { + 
netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", + line, filename, value, key); + } + } + else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) { + ac->options |= RRDR_OPTION_ABSOLUTE; + } + else if(!strcasecmp(key, "min2max")) { + ac->options |= RRDR_OPTION_DIMS_MIN2MAX; + } + else if(!strcasecmp(key, "average")) { + ac->options |= RRDR_OPTION_DIMS_AVERAGE; + } + else if(!strcasecmp(key, "min")) { + ac->options |= RRDR_OPTION_DIMS_MIN; + } + else if(!strcasecmp(key, "max")) { + ac->options |= RRDR_OPTION_DIMS_MAX; + } + else if(!strcasecmp(key, "sum")) { + ; + } + else if(!strcasecmp(key, "null2zero")) { + ac->options |= RRDR_OPTION_NULL2ZERO; + } + else if(!strcasecmp(key, "percentage")) { + ac->options |= RRDR_OPTION_PERCENTAGE; + } + else if(!strcasecmp(key, "unaligned")) { + ac->options |= RRDR_OPTION_NOT_ALIGNED; + } + else if(!strcasecmp(key, "anomaly-bit")) { + ac->options |= RRDR_OPTION_ANOMALY_BIT; + } + else if(!strcasecmp(key, "match-ids") || !strcasecmp(key, "match_ids")) { + ac->options |= RRDR_OPTION_MATCH_IDS; + } + else if(!strcasecmp(key, "match-names") || !strcasecmp(key, "match_names")) { + ac->options |= RRDR_OPTION_MATCH_NAMES; + } + else if(!strcasecmp(key, "of")) { + char *find = NULL; + if(*s && strcasecmp(s, "all") != 0) { + find = strcasestr(s, " foreach"); + if(find) { + *find = '\0'; + } + ac->dimensions = string_strdupz(s); + } + + if(!find) { + break; + } + s = ++find; + } + else { + netdata_log_error("Health configuration at line %zu of file '%s': unknown keyword '%s'", + line, filename, key); + } + } + + return 1; +} + +static inline STRING *health_source_file(size_t line, const char *file) { + char buffer[FILENAME_MAX + 1]; + snprintfz(buffer, FILENAME_MAX, "line=%zu,file=%s", line, file); + return string_strdupz(buffer); +} + +char *health_edit_command_from_source(const char *source) +{ + char buffer[FILENAME_MAX + 1]; + char 
*temp = strdupz(source); + char *line_num = strchr(temp, '@'); + char *line_p = temp; + char *file_no_path = strrchr(temp, '/'); + + // Check for the 'line=' format if '@' is not found + if (!line_num) { + line_num = strstr(temp, "line="); + file_no_path = strstr(temp, "file=/"); + } + + if (likely(file_no_path && line_num)) { + if (line_num == strchr(temp, '@')) { + *line_num = '\0'; // Handle the old format + } else { + line_num += strlen("line="); + file_no_path = strrchr(file_no_path + strlen("file="), '/'); + char *line_end = strchr(line_num, ','); + if (line_end) { + line_p = line_num; + *line_end = '\0'; + } + } + + snprintfz( + buffer, + FILENAME_MAX, + "sudo %s/edit-config health.d/%s=%s=%s", + netdata_configured_user_config_dir, + file_no_path + 1, + line_p, + rrdhost_registry_hostname(localhost)); + } else { + buffer[0] = '\0'; + } + + freez(temp); + return strdupz(buffer); +} + + +static inline void strip_quotes(char *s) { + while(*s) { + if(*s == '\'' || *s == '"') *s = ' '; + s++; + } +} + +static void replace_green_red(RRD_ALERT_PROTOTYPE *ap, NETDATA_DOUBLE green, NETDATA_DOUBLE red) { + if(!isnan(green)) { + STRING *green_str = string_strdupz("green"); + expression_hardcode_variable(ap->config.calculation, green_str, green); + expression_hardcode_variable(ap->config.warning, green_str, green); + expression_hardcode_variable(ap->config.critical, green_str, green); + string_freez(green_str); + } + + if(!isnan(red)) { + STRING *red_str = string_strdupz("red"); + expression_hardcode_variable(ap->config.calculation, red_str, red); + expression_hardcode_variable(ap->config.warning, red_str, red); + expression_hardcode_variable(ap->config.critical, red_str, red); + string_freez(red_str); + } +} + +static void dims_grouping_from_rrdr_options(RRD_ALERT_PROTOTYPE *ap) { + if(ap->config.options & RRDR_OPTION_DIMS_MIN) + ap->config.dims_group = ALERT_LOOKUP_DIMS_MIN; + else if(ap->config.options & RRDR_OPTION_DIMS_MAX) + ap->config.dims_group = 
ALERT_LOOKUP_DIMS_MAX; + else if(ap->config.options & RRDR_OPTION_DIMS_MIN2MAX) + ap->config.dims_group = ALERT_LOOKUP_DIMS_MIN2MAX; + else if(ap->config.options & RRDR_OPTION_DIMS_AVERAGE) + ap->config.dims_group = ALERT_LOOKUP_DIMS_AVERAGE; + else + ap->config.dims_group = ALERT_LOOKUP_DIMS_SUM; +} + +static void lookup_data_source_from_rrdr_options(RRD_ALERT_PROTOTYPE *ap) { + if(ap->config.options & RRDR_OPTION_PERCENTAGE) + ap->config.data_source = ALERT_LOOKUP_DATA_SOURCE_PERCENTAGES; + else if(ap->config.options & RRDR_OPTION_ANOMALY_BIT) + ap->config.data_source = ALERT_LOOKUP_DATA_SOURCE_ANOMALIES; + else + ap->config.data_source = ALERT_LOOKUP_DATA_SOURCE_SAMPLES; +} + +#define PARSE_HEALTH_CONFIG_LOG_DUPLICATE_STRING_MSG(ax, member) do { \ + if(strcmp(string2str(ax->member), value) != 0) \ + netdata_log_error( \ + "Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, " \ + "once with value '%s' and later with value '%s'. Using ('%s').", \ + line, filename, string2str(ac->name), key, \ + string2str(ax->member), value, value); \ +} while(0) + +#define PARSE_HEALTH_CONFIG_LINE_STRING(ax, member) do { \ + if(ax->member) { \ + PARSE_HEALTH_CONFIG_LOG_DUPLICATE_STRING_MSG(ax, member); \ + string_freez(ax->member); \ + } \ + ax->member = string_strdupz(value); \ +} while(0) + +#define PARSE_HEALTH_CONFIG_LINE_PATTERN_APPEND(ax, member, label) do { \ + const char *_label = label; \ + if(_label && !*_label) \ + _label = NULL; \ + \ + if(value && (!*value || strcmp(value, "*") == 0)) \ + value = NULL; \ + else if(value && (strcmp(value, "!* *") == 0 || strcmp(value, "!*") == 0)) { \ + value = NULL; \ + ap->match.enabled = false; \ + } \ + \ + if(value && !_label && !strchr(value, '=')) { \ + netdata_log_error( \ + "Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' " \ + "with value '%s' that does not match label=pattern. 
Ignoring it.", \ + line, filename, string2str(ac->name), key, value); \ + value = NULL; \ + } \ + \ + if(value) { \ + typeof(ax->member) _old = ax->member; \ + char _buf[strlen(value) + string_strlen(_old) + (_label ? strlen(_label) : 0) + 3]; \ + snprintfz(_buf, sizeof(_buf), "%s%s%s%s%s", \ + _label ? _label : "", \ + _label ? "=" : "", \ + value, \ + _old ? " " : "", \ + _old ? string2str(_old) : ""); \ + string_freez(_old); \ + ax->member = string_strdupz(_buf); \ + } \ +} while(0) + +int health_readfile(const char *filename, void *data __maybe_unused, bool stock_config) { + netdata_log_debug(D_HEALTH, "Health configuration reading file '%s'", filename); + + static uint32_t + hash_alarm = 0, + hash_template = 0, + hash_os = 0, + hash_on = 0, + hash_host = 0, + hash_plugin = 0, + hash_module = 0, + hash_calc = 0, + hash_green = 0, + hash_red = 0, + hash_warn = 0, + hash_crit = 0, + hash_exec = 0, + hash_every = 0, + hash_lookup = 0, + hash_units = 0, + hash_summary = 0, + hash_info = 0, + hash_class = 0, + hash_component = 0, + hash_type = 0, + hash_recipient = 0, + hash_delay = 0, + hash_options = 0, + hash_repeat = 0, + hash_host_label = 0, + hash_chart_label = 0; + + char buffer[HEALTH_CONF_MAX_LINE + 1]; + + if(unlikely(!hash_alarm)) { + hash_alarm = simple_uhash(HEALTH_ALARM_KEY); + hash_template = simple_uhash(HEALTH_TEMPLATE_KEY); + hash_on = simple_uhash(HEALTH_ON_KEY); + hash_os = simple_uhash(HEALTH_OS_KEY); + hash_host = simple_uhash(HEALTH_HOST_KEY); + hash_plugin = simple_uhash(HEALTH_PLUGIN_KEY); + hash_module = simple_uhash(HEALTH_MODULE_KEY); + hash_calc = simple_uhash(HEALTH_CALC_KEY); + hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY); + hash_green = simple_uhash(HEALTH_GREEN_KEY); + hash_red = simple_uhash(HEALTH_RED_KEY); + hash_warn = simple_uhash(HEALTH_WARN_KEY); + hash_crit = simple_uhash(HEALTH_CRIT_KEY); + hash_exec = simple_uhash(HEALTH_EXEC_KEY); + hash_every = simple_uhash(HEALTH_EVERY_KEY); + hash_units = simple_hash(HEALTH_UNITS_KEY); 
+ hash_summary = simple_hash(HEALTH_SUMMARY_KEY); + hash_info = simple_hash(HEALTH_INFO_KEY); + hash_class = simple_uhash(HEALTH_CLASS_KEY); + hash_component = simple_uhash(HEALTH_COMPONENT_KEY); + hash_type = simple_uhash(HEALTH_TYPE_KEY); + hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY); + hash_delay = simple_uhash(HEALTH_DELAY_KEY); + hash_options = simple_uhash(HEALTH_OPTIONS_KEY); + hash_repeat = simple_uhash(HEALTH_REPEAT_KEY); + hash_host_label = simple_uhash(HEALTH_HOST_LABEL_KEY); + hash_chart_label = simple_uhash(HEALTH_CHART_LABEL_KEY); + } + + FILE *fp = fopen(filename, "r"); + if(!fp) { + netdata_log_error("Health configuration cannot read file '%s'.", filename); + return 0; + } + + RRD_ALERT_PROTOTYPE *ap = NULL; + struct rrd_alert_config *ac = NULL; + struct rrd_alert_match *am = NULL; + NETDATA_DOUBLE green = NAN; + NETDATA_DOUBLE red = NAN; + + size_t line = 0, append = 0; + char *s; + while((s = fgets(&buffer[append], (int)(HEALTH_CONF_MAX_LINE - append), fp)) || append) { + int stop_appending = !s; + line++; + s = trim(buffer); + if(!s || *s == '#') continue; + + append = strlen(s); + if(!stop_appending && s[append - 1] == '\\') { + s[append - 1] = ' '; + append = &s[append] - buffer; + if(append < HEALTH_CONF_MAX_LINE) + continue; + else + netdata_log_error( + "Health configuration has too long multi-line at line %zu of file '%s'.", + line, filename); + } + append = 0; + + char *key = s; + while(*s && *s != ':') s++; + if(!*s) { + netdata_log_error( + "Health configuration has invalid line %zu of file '%s'. It does not contain a ':'. Ignoring it.", + line, filename); + continue; + } + *s = '\0'; + s++; + + char *value = s; + key = trim_all(key); + value = trim_all(value); + + if(!key) { + netdata_log_error( + "Health configuration has invalid line %zu of file '%s'. Keyword is empty. Ignoring it.", + line, filename); + + continue; + } + + if(!value) { + netdata_log_error( + "Health configuration has invalid line %zu of file '%s'. 
value is empty. Ignoring it.", + line, filename); + continue; + } + + uint32_t hash = simple_uhash(key); + + if((hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) || (hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY))) { + if(ap) { + lookup_data_source_from_rrdr_options(ap); + dims_grouping_from_rrdr_options(ap); + replace_green_red(ap, green, red); + health_prototype_add(ap, NULL); + freez(ap); + } + + ap = callocz(1, sizeof(*ap)); + am = &ap->match; + ac = &ap->config; + + { + char *tmp = strdupz(value); + if(rrdvar_fix_name(tmp)) + netdata_log_error("Health configuration renamed alarm '%s' to '%s'", value, tmp); + + ap->config.name = string_strdupz(tmp); + freez(tmp); + } + + ap->_internal.enabled = true; + ap->match.enabled = true; + ap->match.is_template = (hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)); + ap->config.source = health_source_file(line, filename); + ap->config.source_type = stock_config ? DYNCFG_SOURCE_TYPE_STOCK : DYNCFG_SOURCE_TYPE_USER; + green = NAN; + red = NAN; + ap->config.delay_multiplier = 1; + ap->config.warn_repeat_every = health_globals.config.default_warn_repeat_every; + ap->config.crit_repeat_every = health_globals.config.default_crit_repeat_every; + } + else if(!am || !ac || !ap) { + netdata_log_error( + "Health configuration at line %zu of file '%s' has unknown key '%s'. 
" + "Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.", + line, filename, key); + } + else if(!am->is_template && hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { + PARSE_HEALTH_CONFIG_LINE_STRING(am, on.chart); + } + else if(am->is_template && hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { + PARSE_HEALTH_CONFIG_LINE_STRING(am, on.context); + } + else if(hash == hash_os && !strcasecmp(key, HEALTH_OS_KEY)) { + PARSE_HEALTH_CONFIG_LINE_PATTERN_APPEND(am, host_labels, "_os"); + } + else if(hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { + PARSE_HEALTH_CONFIG_LINE_PATTERN_APPEND(am, host_labels, "_hostname"); + } + else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { + PARSE_HEALTH_CONFIG_LINE_PATTERN_APPEND(am, host_labels, NULL); + } + else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { + PARSE_HEALTH_CONFIG_LINE_PATTERN_APPEND(am, chart_labels, "_collect_plugin"); + } + else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) { + PARSE_HEALTH_CONFIG_LINE_PATTERN_APPEND(am, chart_labels, "_collect_module"); + } + else if(hash == hash_chart_label && !strcasecmp(key, HEALTH_CHART_LABEL_KEY)) { + PARSE_HEALTH_CONFIG_LINE_PATTERN_APPEND(am, chart_labels, NULL); + } + else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, classification); + } + else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, component); + } + else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, type); + } + else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { + health_parse_db_lookup(line, filename, value, ac); + } + else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { + if(!config_parse_duration(value, &ac->update_every)) + netdata_log_error( + 
"Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' " + "cannot parse duration: '%s'.", + line, filename, string2str(ac->name), key, value); + } + else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { + char *e; + green = str2ndd(value, &e); + if(e && *e) { + netdata_log_error( + "Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' " + "leaves this string unmatched: '%s'.", + line, filename, string2str(ac->name), key, e); + } + } + else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { + char *e; + red = str2ndd(value, &e); + if(e && *e) { + netdata_log_error( + "Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' " + "leaves this string unmatched: '%s'.", + line, filename, string2str(ac->name), key, e); + } + } + else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { + const char *failed_at = NULL; + int error = 0; + ac->calculation = expression_parse(value, &failed_at, &error); + if(!ac->calculation) { + netdata_log_error( + "Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' " + "has non-parseable expression '%s': %s at '%s'", + line, filename, string2str(ac->name), key, value, expression_strerror(error), failed_at); + am->enabled = false; + } + } + else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) { + const char *failed_at = NULL; + int error = 0; + ac->warning = expression_parse(value, &failed_at, &error); + if(!ac->warning) { + netdata_log_error( + "Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' " + "has non-parseable expression '%s': %s at '%s'", + line, filename, string2str(ac->name), key, value, expression_strerror(error), failed_at); + am->enabled = false; + } + } + else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) { + const char *failed_at = NULL; + int error = 0; + ac->critical = expression_parse(value, &failed_at, &error); + if(!ac->critical) { + netdata_log_error( + "Health 
configuration at line %zu of file '%s' for alarm '%s' at key '%s' " + "has non-parseable expression '%s': %s at '%s'", + line, filename, string2str(ac->name), key, value, expression_strerror(error), failed_at); + am->enabled = false; + } + } + else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, exec); + } + else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, recipient); + } + else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, units); + } + else if(hash == hash_summary && !strcasecmp(key, HEALTH_SUMMARY_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, summary); + } + else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { + strip_quotes(value); + PARSE_HEALTH_CONFIG_LINE_STRING(ac, info); + } + else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { + health_parse_delay(line, filename, value, + &ac->delay_up_duration, &ac->delay_down_duration, + &ac->delay_max_duration, &ac->delay_multiplier); + } + else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { + ac->alert_action_options |= health_parse_options(value); + } + else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + health_parse_repeat(line, filename, value, + &ac->warn_repeat_every, + &ac->crit_repeat_every); + ac->has_custom_repeat_config = true; + } + else { + if (strcmp(key, "families") != 0 && strcmp(key, "charts") != 0) + netdata_log_error( + "Health configuration at line %zu of file '%s' for alarm/template '%s' has unknown key '%s'.", + line, filename, string2str(ac->name), key); + } + } + + if(ap) { + lookup_data_source_from_rrdr_options(ap); + dims_grouping_from_rrdr_options(ap); + replace_green_red(ap, green, red); + health_prototype_add(ap, NULL); + freez(ap); + } + + fclose(fp); + return 1; 
+} diff --git a/src/health/health_dyncfg.c b/src/health/health_dyncfg.c new file mode 100644 index 000000000..f2b9bc607 --- /dev/null +++ b/src/health/health_dyncfg.c @@ -0,0 +1,842 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health_internals.h" + +#define DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX "health:alert:prototype" + +static void health_dyncfg_register_prototype(RRD_ALERT_PROTOTYPE *ap); + +// --------------------------------------------------------------------------------------------------------------------- +// parse the json object of an alert definition + +static void dims_grouping_to_rrdr_options(RRD_ALERT_PROTOTYPE *ap) { + ap->config.options &= ~(RRDR_OPTIONS_DIMS_AGGREGATION); + + switch(ap->config.dims_group) { + default: + case ALERT_LOOKUP_DIMS_SUM: + break; + + case ALERT_LOOKUP_DIMS_AVERAGE: + ap->config.options |= RRDR_OPTION_DIMS_AVERAGE; + break; + + case ALERT_LOOKUP_DIMS_MIN: + ap->config.options |= RRDR_OPTION_DIMS_MIN; + break; + + case ALERT_LOOKUP_DIMS_MAX: + ap->config.options |= RRDR_OPTION_DIMS_MAX; + break; + + case ALERT_LOOKUP_DIMS_MIN2MAX: + ap->config.options |= RRDR_OPTION_DIMS_MIN2MAX; + break; + } +} + +static void data_source_to_rrdr_options(RRD_ALERT_PROTOTYPE *ap) { + ap->config.options &= ~(RRDR_OPTIONS_DATA_SOURCES); + + switch(ap->config.data_source) { + default: + case ALERT_LOOKUP_DATA_SOURCE_SAMPLES: + break; + + case ALERT_LOOKUP_DATA_SOURCE_PERCENTAGES: + ap->config.options |= RRDR_OPTION_PERCENTAGE; + break; + + case ALERT_LOOKUP_DATA_SOURCE_ANOMALIES: + ap->config.options |= RRDR_OPTION_ANOMALY_BIT; + break; + } +} + +static bool parse_match(json_object *jobj, const char *path, struct rrd_alert_match *match, BUFFER *error, bool strict) { + STRING *on = NULL; + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "on", on, error, strict); + if(match->is_template) + match->on.context = on; + else + match->on.chart = on; + + JSONC_PARSE_TXT2PATTERN_OR_ERROR_AND_RETURN(jobj, path, "host_labels", 
match->host_labels, error, strict); + JSONC_PARSE_TXT2PATTERN_OR_ERROR_AND_RETURN(jobj, path, "instance_labels", match->chart_labels, error, strict); + + return true; +} + +static bool parse_config_value_database_lookup(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) { + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "after", config->after, error, strict); + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "before", config->before, error, strict); + JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "time_group", time_grouping_txt2id, config->time_group, error, strict); + JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "dims_group", alerts_dims_grouping2id, config->dims_group, error, strict); + JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "data_source", alerts_data_sources2id, config->data_source, error, strict); + + switch(config->time_group) { + default: + break; + + case RRDR_GROUPING_COUNTIF: + JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "time_group_condition", alerts_group_condition2id, config->time_group_condition, error, strict); + // fall through + + case RRDR_GROUPING_TRIMMED_MEAN: + case RRDR_GROUPING_TRIMMED_MEDIAN: + case RRDR_GROUPING_PERCENTILE: + JSONC_PARSE_DOUBLE_OR_ERROR_AND_RETURN(jobj, path, "time_group_value", config->time_group_value, error, strict); + break; + } + + JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "options", rrdr_options_parse_one, config->options, error, strict); + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "dimensions", config->dimensions, error, strict); + return true; +} + +static bool parse_config_value(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) { + JSONC_PARSE_SUBOBJECT(jobj, path, "database_lookup", config, parse_config_value_database_lookup, error, strict); + JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "calculation", config->calculation, error, 
false); + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "units", config->units, error, false); + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "update_every", config->update_every, error, strict); + return true; +} + +static bool parse_config_conditions(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) { + JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "warning_condition", config->warning, error, strict); + JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "critical_condition", config->critical, error, strict); + return true; +} + +static bool parse_config_action_delay(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) { + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "up", config->delay_up_duration, error, strict); + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "down", config->delay_down_duration, error, strict); + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "max", config->delay_max_duration, error, strict); + JSONC_PARSE_DOUBLE_OR_ERROR_AND_RETURN(jobj, path, "multiplier", config->delay_multiplier, error, strict); + return true; +} + +static bool parse_config_action_repeat(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) { + JSONC_PARSE_BOOL_OR_ERROR_AND_RETURN(jobj, path, "enabled", config->has_custom_repeat_config, error, strict); + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "warning", config->warn_repeat_every, error, strict); + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "critical", config->crit_repeat_every, error, strict); + return true; +} + +static bool parse_config_action(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) { + JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "options", alert_action_options_parse_one, config->alert_action_options, error, strict); + 
JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "execute", config->exec, error, strict); + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "recipient", config->recipient, error, strict); + JSONC_PARSE_SUBOBJECT(jobj, path, "delay", config, parse_config_action_delay, error, strict); + JSONC_PARSE_SUBOBJECT(jobj, path, "repeat", config, parse_config_action_repeat, error, strict); + return true; +} + +static bool parse_config(json_object *jobj, const char *path, RRD_ALERT_PROTOTYPE *ap, BUFFER *error, bool strict) { + // we shouldn't parse these from the payload - they are given to us via the function call + // JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "source_type", dyncfg_source_type2id, ap->config.source_type, error, strict); + // JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "source", ap->config.source, error, strict); + + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "summary", ap->config.summary, error, false); + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "info", ap->config.info, error, false); + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "type", ap->config.type, error, false); + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "component", ap->config.component, error, false); + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "classification", ap->config.classification, error, false); + + JSONC_PARSE_SUBOBJECT(jobj, path, "value", &ap->config, parse_config_value, error, strict); + JSONC_PARSE_SUBOBJECT(jobj, path, "conditions", &ap->config, parse_config_conditions, error, false); + JSONC_PARSE_SUBOBJECT(jobj, path, "action", &ap->config, parse_config_action, error, false); + JSONC_PARSE_SUBOBJECT(jobj, path, "match", &ap->match, parse_match, error, strict); + + return true; +} + +static bool parse_prototype(json_object *jobj, const char *path, RRD_ALERT_PROTOTYPE *base, BUFFER *error, const char *name, bool strict) { + int64_t version = 0; + JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, 
path, "format_version", version, error, strict); + + if(version != 1) { + buffer_sprintf(error, "unsupported document version"); + return false; + } + + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "name", base->config.name, error, !name && !*name && strict); + + json_object *rules; + if (json_object_object_get_ex(jobj, "rules", &rules)) { + size_t rules_len = json_object_array_length(rules); + + RRD_ALERT_PROTOTYPE *ap = base; // fill the first entry + for (size_t i = 0; i < rules_len; i++) { + if(!ap) { + ap = callocz(1, sizeof(*base)); + ap->config.name = string_dup(base->config.name); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base->_internal.next, ap, _internal.prev, _internal.next); + } + + json_object *rule = json_object_array_get_idx(rules, i); + + JSONC_PARSE_BOOL_OR_ERROR_AND_RETURN(rule, path, "enabled", ap->match.enabled, error, strict); + + STRING *type = NULL; + JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(rule, path, "type", type, error, strict); + if(string_strcmp(type, "template") == 0) + ap->match.is_template = true; + else if(string_strcmp(type, "instance") == 0) + ap->match.is_template = false; + else { + buffer_sprintf(error, "type is '%s', but it can only be 'instance' or 'template'", string2str(type)); + return false; + } + + JSONC_PARSE_SUBOBJECT(rule, path, "config", ap, parse_config, error, strict); + + ap = NULL; // so that we will create another one, if available + } + } + else { + buffer_sprintf(error, "the rules array is missing"); + return false; + } + + return true; +} + +static RRD_ALERT_PROTOTYPE *health_prototype_payload_parse(const char *payload, size_t payload_len, BUFFER *error, const char *name, bool strict) { + RRD_ALERT_PROTOTYPE *base = callocz(1, sizeof(*base)); + CLEAN_JSON_OBJECT *jobj = NULL; + + struct json_tokener *tokener = json_tokener_new(); + if (!tokener) { + buffer_sprintf(error, "failed to allocate memory for json tokener"); + goto cleanup; + } + + jobj = json_tokener_parse_ex(tokener, payload, 
(int)payload_len); + if (json_tokener_get_error(tokener) != json_tokener_success) { + const char *error_msg = json_tokener_error_desc(json_tokener_get_error(tokener)); + buffer_sprintf(error, "failed to parse json payload: %s", error_msg); + json_tokener_free(tokener); + goto cleanup; + } + json_tokener_free(tokener); + + if(!parse_prototype(jobj, "", base, error, name, strict)) + goto cleanup; + + if(!base->config.name && name) + base->config.name = string_strdupz(name); + + if(name && *name && string_strcmp(base->config.name, name) != 0) { + string_freez(base->config.name); + base->config.name = string_strdupz(name); + } + + int i = 1; + for(RRD_ALERT_PROTOTYPE *ap = base; ap; ap = ap->_internal.next, i++) { + if(ap->config.name != base->config.name) { + string_freez(ap->config.name); + ap->config.name = string_dup(base->config.name); + } + + if(!RRDCALC_HAS_DB_LOOKUP(ap) && !ap->config.calculation && strict) { + buffer_sprintf(error, "Item %d has neither database lookup nor calculation", i - 1); + goto cleanup; + } + + data_source_to_rrdr_options(ap); + dims_grouping_to_rrdr_options(ap); + + if(ap->match.enabled) + base->_internal.enabled = true; + } + + return base; + +cleanup: + health_prototype_free(base); + return NULL; +} + +// --------------------------------------------------------------------------------------------------------------------- +// generate the json object of an alert definition + +static inline void health_prototype_rule_to_json_array_member(BUFFER *wb, RRD_ALERT_PROTOTYPE *ap, bool for_hashing) { + buffer_json_add_array_item_object(wb); + { + buffer_json_member_add_boolean(wb, "enabled", ap->match.enabled); + buffer_json_member_add_string(wb, "type", ap->match.is_template ? 
"template" : "instance"); + + buffer_json_member_add_object(wb, "config"); + { + if(!for_hashing) { + buffer_json_member_add_uuid(wb, "hash", &ap->config.hash_id); + buffer_json_member_add_string(wb, "source_type", dyncfg_id2source_type(ap->config.source_type)); + buffer_json_member_add_string(wb, "source", string2str(ap->config.source)); + } + + buffer_json_member_add_object(wb, "match"); + { + if(ap->match.is_template) + buffer_json_member_add_string(wb, "on", string2str(ap->match.on.context)); + else + buffer_json_member_add_string(wb, "on", string2str(ap->match.on.chart)); + + buffer_json_member_add_string_or_empty(wb, "host_labels", ap->match.host_labels ? string2str(ap->match.host_labels) : "*"); + buffer_json_member_add_string_or_empty(wb, "instance_labels", ap->match.chart_labels ? string2str(ap->match.chart_labels) : "*"); + } + buffer_json_object_close(wb); // match + + buffer_json_member_add_string(wb, "summary", string2str(ap->config.summary)); + buffer_json_member_add_string(wb, "info", string2str(ap->config.info)); + + buffer_json_member_add_string(wb, "type", string2str(ap->config.type)); + buffer_json_member_add_string(wb, "component", string2str(ap->config.component)); + buffer_json_member_add_string(wb, "classification", string2str(ap->config.classification)); + + buffer_json_member_add_object(wb, "value"); + { + buffer_json_member_add_object(wb, "database_lookup"); + { + buffer_json_member_add_int64(wb, "after", ap->config.after); + buffer_json_member_add_int64(wb, "before", ap->config.before); + buffer_json_member_add_string(wb, "time_group", time_grouping_id2txt(ap->config.time_group)); + buffer_json_member_add_string(wb, "time_group_condition", alerts_group_conditions_id2txt(ap->config.time_group_condition)); + buffer_json_member_add_double(wb, "time_group_value", ap->config.time_group_value); + buffer_json_member_add_string(wb, "dims_group", alerts_dims_grouping_id2group(ap->config.dims_group)); + buffer_json_member_add_string(wb, 
"data_source", alerts_data_source_id2source(ap->config.data_source)); + rrdr_options_to_buffer_json_array(wb, "options", RRDR_OPTIONS_REMOVE_OVERLAPPING(ap->config.options)); + buffer_json_member_add_string(wb, "dimensions", string2str(ap->config.dimensions)); + } + buffer_json_object_close(wb); // database lookup + + buffer_json_member_add_string(wb, "calculation", expression_source(ap->config.calculation)); + buffer_json_member_add_string(wb, "units", string2str(ap->config.units)); + buffer_json_member_add_uint64(wb, "update_every", ap->config.update_every); + } + buffer_json_object_close(wb); // value + + buffer_json_member_add_object(wb, "conditions"); + { + buffer_json_member_add_string(wb, "warning_condition", expression_source(ap->config.warning)); + buffer_json_member_add_string(wb, "critical_condition", expression_source(ap->config.critical)); + } + buffer_json_object_close(wb); // conditions + + buffer_json_member_add_object(wb, "action"); + { + alert_action_options_to_buffer_json_array(wb, "options", ap->config.alert_action_options); + buffer_json_member_add_string(wb, "execute", string2str(ap->config.exec)); + buffer_json_member_add_string(wb, "recipient", string2str(ap->config.recipient)); + + buffer_json_member_add_object(wb, "delay"); + { + buffer_json_member_add_int64(wb, "up", ap->config.delay_up_duration); + buffer_json_member_add_int64(wb, "down", ap->config.delay_down_duration); + buffer_json_member_add_int64(wb, "max", ap->config.delay_max_duration); + buffer_json_member_add_double(wb, "multiplier", ap->config.delay_multiplier); + } + buffer_json_object_close(wb); // delay + + buffer_json_member_add_object(wb, "repeat"); + { + buffer_json_member_add_boolean(wb, "enabled", ap->config.has_custom_repeat_config); + buffer_json_member_add_uint64(wb, "warning", ap->config.has_custom_repeat_config ? ap->config.warn_repeat_every : 0); + buffer_json_member_add_uint64(wb, "critical", ap->config.has_custom_repeat_config ? 
ap->config.crit_repeat_every : 0); + } + buffer_json_object_close(wb); // repeat + } + buffer_json_object_close(wb); // action + } + buffer_json_object_close(wb); // config + } + buffer_json_object_close(wb); // array item +} + +// serializes the prototype and every rule chained via _internal.next as one minified JSON document (format_version 1) +void health_prototype_to_json(BUFFER *wb, RRD_ALERT_PROTOTYPE *ap, bool for_hashing) { + buffer_flush(wb); + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_MINIFY); + + buffer_json_member_add_uint64(wb, "format_version", 1); + buffer_json_member_add_string(wb, "name", string2str(ap->config.name)); + buffer_json_member_add_array(wb, "rules"); + { + for(RRD_ALERT_PROTOTYPE *t = ap; t ; t = t->_internal.next) + health_prototype_rule_to_json_array_member(wb, t, for_hashing); + } + buffer_json_array_close(wb); // rules + buffer_json_finalize(wb); +} + +// --------------------------------------------------------------------------------------------------------------------- + +// appends the optional prefix followed by the duration, using the largest unit (h, m, s) that divides it exactly +static inline void dyncfg_user_config_print_duration(BUFFER *wb, const char *prefix, int seconds) { + if((seconds % 3600) == 0) + buffer_sprintf(wb, "%s%dh", prefix?prefix:"", seconds / 3600); + else if((seconds % 60) == 0) + buffer_sprintf(wb, "%s%dm", prefix?prefix:"", seconds / 60); + else + buffer_sprintf(wb, "%s%ds", prefix?prefix:"", seconds); +} + +int dyncfg_health_prototype_to_conf(BUFFER *wb, RRD_ALERT_PROTOTYPE *ap, const char *name) { + buffer_flush(wb); + wb->content_type = CT_TEXT_PLAIN; + wb->expires = now_realtime_sec(); + + int n = 0; + for(RRD_ALERT_PROTOTYPE *nap = ap; nap ; nap = nap->_internal.next) { + if(++n > 1) + buffer_sprintf(wb, "\n"); + + if(nap->match.is_template) { + buffer_sprintf(wb, "%13s: %s\n", "template", name); + buffer_sprintf(wb, "%13s: %s\n", "on", string2str(nap->match.on.context)); + } + else { + buffer_sprintf(wb, "%13s: %s\n", "alarm", name); + buffer_sprintf(wb, "%13s: %s\n", "on", string2str(nap->match.on.chart)); + } + + if(nap->config.classification) + buffer_sprintf(wb, "%13s: %s\n", "class", 
string2str(nap->config.classification)); + + if(nap->config.type) + buffer_sprintf(wb, "%13s: %s\n", "type", string2str(nap->config.type)); + + if(nap->config.component) + buffer_sprintf(wb, "%13s: %s\n", "component", string2str(nap->config.component)); + + if(nap->match.host_labels) + buffer_sprintf(wb, "%13s: %s\n", "host labels", string2str(nap->match.host_labels)); + + if(nap->match.chart_labels) + buffer_sprintf(wb, "%13s: %s\n", "chart labels", string2str(nap->match.chart_labels)); + + if(nap->config.after) { + buffer_sprintf(wb, "%13s: %s", "lookup", time_grouping_tostring(nap->config.time_group)); + switch(nap->config.time_group) { + case RRDR_GROUPING_PERCENTILE: + case RRDR_GROUPING_TRIMMED_MEAN: + case RRDR_GROUPING_TRIMMED_MEDIAN: + buffer_sprintf(wb, "(%0.2f)", nap->config.time_group_value); + break; + + case RRDR_GROUPING_COUNTIF: + buffer_sprintf(wb, "(%s%0.2f)", alerts_group_conditions_id2txt(nap->config.time_group_condition), nap->config.time_group_value); + break; + + default: + break; + } + + dyncfg_user_config_print_duration(wb, " ", nap->config.after); + + if(nap->config.before) + dyncfg_user_config_print_duration(wb, " at ", nap->config.before); + + if(nap->config.options) { + buffer_strcat(wb, " "); + rrdr_options_to_buffer(wb, nap->config.options); + } + + if(nap->config.dimensions) + buffer_sprintf(wb, " of %s", string2str(nap->config.dimensions)); + + buffer_strcat(wb, "\n"); + } + + if(nap->config.calculation) + buffer_sprintf(wb, "%13s: %s\n", "calc", expression_source(nap->config.calculation)); + + if(nap->config.units) + buffer_sprintf(wb, "%13s: %s\n", "units", string2str(nap->config.units)); + + if(nap->config.update_every) { + buffer_sprintf(wb, "%13s: ", "every"); + dyncfg_user_config_print_duration(wb, NULL, nap->config.update_every); + buffer_strcat(wb, "\n"); + } + + if(nap->config.warning) + buffer_sprintf(wb, "%13s: %s\n", "warn", expression_source(nap->config.warning)); + + if(nap->config.critical) + buffer_sprintf(wb, "%13s: 
%s\n", "crit", expression_source(nap->config.critical)); + + if(nap->config.delay_up_duration || nap->config.delay_down_duration) { + buffer_sprintf(wb, "%13s:", "delay"); + + if(nap->config.delay_up_duration) + dyncfg_user_config_print_duration(wb, " up ", nap->config.delay_up_duration); + + if(nap->config.delay_down_duration) + dyncfg_user_config_print_duration(wb, " down ", nap->config.delay_down_duration); + + if(nap->config.delay_multiplier) + buffer_sprintf(wb, " multiplier %0.2f", nap->config.delay_multiplier); + + if(nap->config.delay_max_duration) + dyncfg_user_config_print_duration(wb, " max ", nap->config.delay_max_duration); + + buffer_strcat(wb, "\n"); + } + + if(nap->config.alert_action_options) { + buffer_sprintf(wb, "%13s:", "options"); + alert_action_options_to_buffer(wb, nap->config.alert_action_options); + buffer_strcat(wb, "\n"); + } + + if(nap->config.has_custom_repeat_config) { + if(!nap->config.crit_repeat_every && !nap->config.warn_repeat_every) + buffer_sprintf(wb, "%13s: off\n", "repeat"); + else { + // emit the key first, like every other entry, so that the intervals form a complete 'repeat' line + buffer_sprintf(wb, "%13s:", "repeat"); + dyncfg_user_config_print_duration(wb, " warning ", (int)nap->config.warn_repeat_every); + dyncfg_user_config_print_duration(wb, " critical ", (int)nap->config.crit_repeat_every); + buffer_strcat(wb, "\n"); + } + } + + if(nap->config.summary) + buffer_sprintf(wb, "%13s: %s\n", "summary", string2str(nap->config.summary)); + + if(nap->config.info) + buffer_sprintf(wb, "%13s: %s\n", "info", string2str(nap->config.info)); + + if(nap->config.exec && nap->config.exec != localhost->health.health_default_exec) + buffer_sprintf(wb, "%13s: %s\n", "exec", string2str(nap->config.exec)); + + if(nap->config.recipient) + buffer_sprintf(wb, "%13s: %s\n", "to", string2str(nap->config.recipient)); + } + + return 200; +} + +// --------------------------------------------------------------------------------------------------------------------- + +static size_t dyncfg_health_remove_all_rrdcalc_of_prototype(STRING *alert_name) { + size_t removed = 0; + + RRDHOST 
*host; + dfe_start_reentrant(rrdhost_root_index, host) { + RRDCALC *rc; + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(rc->config.name != alert_name) + continue; + + rrdcalc_unlink_and_delete(host, rc, false); + removed++; + } + foreach_rrdcalc_in_rrdhost_done(rc); + } + dfe_done(host); + + return removed; +} + +static void dyncfg_health_prototype_reapply(RRD_ALERT_PROTOTYPE *ap) { + dyncfg_health_remove_all_rrdcalc_of_prototype(ap->config.name); + health_prototype_apply_to_all_hosts(ap); +} + +static int dyncfg_health_prototype_template_action(BUFFER *result, DYNCFG_CMDS cmd, const char *add_name, BUFFER *payload, const char *source __maybe_unused) { + int code = HTTP_RESP_INTERNAL_SERVER_ERROR; + switch(cmd) { + case DYNCFG_CMD_ADD: { + CLEAN_BUFFER *error = buffer_create(0, NULL); + RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, add_name, true); + if(!nap) + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error)); + else { + char *msg = ""; + + nap->config.source_type = DYNCFG_SOURCE_TYPE_DYNCFG; + bool added = health_prototype_add(nap, &msg); // this swaps ap <-> nap + + if(!added) { + health_prototype_free(nap); + if(!msg || !*msg) msg = "required attributes are missing"; + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, msg); + } + else + freez(nap); + + const DICTIONARY_ITEM *item = dictionary_get_and_acquire_item(health_globals.prototypes.dict, add_name); + if(!item) + return dyncfg_default_response(result, HTTP_RESP_INTERNAL_SERVER_ERROR, "added prototype is not found"); + + RRD_ALERT_PROTOTYPE *ap = dictionary_acquired_item_value(item); + + dyncfg_health_prototype_reapply(ap); + health_dyncfg_register_prototype(ap); + code = ap->_internal.enabled ? 
DYNCFG_RESP_ACCEPTED : DYNCFG_RESP_ACCEPTED_DISABLED; + dictionary_acquired_item_release(health_globals.prototypes.dict, item); + + code = dyncfg_default_response(result, code, "accepted"); + } + } + break; + + case DYNCFG_CMD_USERCONFIG: { + CLEAN_BUFFER *error = buffer_create(0, NULL); + RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, add_name, false); + if(!nap) + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error)); + else { + code = dyncfg_health_prototype_to_conf(result, nap, add_name); + health_prototype_free(nap); + } + } + break; + + case DYNCFG_CMD_SCHEMA: + code = dyncfg_default_response(result, HTTP_RESP_NOT_IMPLEMENTED, "schema not implemented yet for prototype templates"); + break; + + case DYNCFG_CMD_TEST: + code = dyncfg_default_response(result, HTTP_RESP_NOT_IMPLEMENTED, "test not implemented yet for prototype templates"); + break; + + case DYNCFG_CMD_REMOVE: + case DYNCFG_CMD_RESTART: + case DYNCFG_CMD_DISABLE: + case DYNCFG_CMD_ENABLE: + case DYNCFG_CMD_UPDATE: + case DYNCFG_CMD_GET: + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "action given is not supported for prototype templates"); + break; + + case DYNCFG_CMD_NONE: + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "invalid action received for prototype templates"); + break; + } + + return code; +} + +static int dyncfg_health_prototype_job_action(BUFFER *result, DYNCFG_CMDS cmd, BUFFER *payload, const char *source __maybe_unused, const char *alert_name) { + const DICTIONARY_ITEM *item = dictionary_get_and_acquire_item(health_globals.prototypes.dict, alert_name); + if(!item) + return dyncfg_default_response(result, HTTP_RESP_NOT_FOUND, "no alert prototype is available by the name given"); + + RRD_ALERT_PROTOTYPE *ap = dictionary_acquired_item_value(item); + + char alert_name_dyncfg[strlen(DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX) + strlen(alert_name) + 10]; + 
snprintfz(alert_name_dyncfg, sizeof(alert_name_dyncfg), DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX ":%s", alert_name); + + int code = HTTP_RESP_INTERNAL_SERVER_ERROR; + + switch(cmd) { + case DYNCFG_CMD_SCHEMA: + code = dyncfg_default_response(result, HTTP_RESP_NOT_IMPLEMENTED, "schema not implemented yet"); + break; + + case DYNCFG_CMD_GET: + health_prototype_to_json(result, ap, false); + code = HTTP_RESP_OK; + break; + + case DYNCFG_CMD_DISABLE: + if(ap->_internal.enabled) { + ap->_internal.enabled = false; + dyncfg_health_prototype_reapply(ap); + dyncfg_status(localhost, alert_name_dyncfg, DYNCFG_STATUS_DISABLED); + code = dyncfg_default_response(result, HTTP_RESP_OK, "disabled"); + } + else + code = dyncfg_default_response(result, HTTP_RESP_OK, "already disabled"); + break; + + case DYNCFG_CMD_ENABLE: + if(ap->_internal.enabled) + code = dyncfg_default_response(result, HTTP_RESP_OK, "already enabled"); + else { + size_t matches_enabled = 0; + spinlock_lock(&ap->_internal.spinlock); + for(RRD_ALERT_PROTOTYPE *t = ap; t ;t = t->_internal.next) + if(t->match.enabled) + matches_enabled++; + spinlock_unlock(&ap->_internal.spinlock); + + if(!matches_enabled) { + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "all rules in this alert are disabled, so enabling the alert has no effect"); + } + else { + ap->_internal.enabled = true; + dyncfg_health_prototype_reapply(ap); + dyncfg_status(localhost, alert_name_dyncfg, DYNCFG_STATUS_ACCEPTED); + code = dyncfg_default_response(result, DYNCFG_RESP_ACCEPTED, "enabled"); + } + } + break; + + case DYNCFG_CMD_UPDATE: { + CLEAN_BUFFER *error = buffer_create(0, NULL); + RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, alert_name, true); + if(!nap) + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error)); + else { + char *msg = ""; + nap->config.source_type = DYNCFG_SOURCE_TYPE_DYNCFG; + bool added = health_prototype_add(nap, 
&msg); // this swaps ap <-> nap + + if(!added) { + health_prototype_free(nap); + if(!msg || !*msg) msg = "required attributes are missing"; + return dyncfg_default_response( result, HTTP_RESP_BAD_REQUEST, msg); + } + else + freez(nap); + + dyncfg_health_prototype_reapply(ap); + code = ap->_internal.enabled ? DYNCFG_RESP_ACCEPTED : DYNCFG_RESP_ACCEPTED_DISABLED; + code = dyncfg_default_response(result, code, "updated"); + } + } + break; + + case DYNCFG_CMD_USERCONFIG: { + CLEAN_BUFFER *error = buffer_create(0, NULL); + RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, alert_name, false); + if(!nap) + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error)); + else { + code = dyncfg_health_prototype_to_conf(result, nap, alert_name); + health_prototype_free(nap); + } + } + break; + + case DYNCFG_CMD_REMOVE: + dyncfg_health_remove_all_rrdcalc_of_prototype(ap->config.name); + dictionary_del(health_globals.prototypes.dict, dictionary_acquired_item_name(item)); + code = dyncfg_default_response(result, HTTP_RESP_OK, "deleted"); + dyncfg_del(localhost, alert_name_dyncfg); + break; + + case DYNCFG_CMD_TEST: + case DYNCFG_CMD_ADD: + case DYNCFG_CMD_RESTART: + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "action given is not supported for the prototype job"); + break; + + case DYNCFG_CMD_NONE: + code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "invalid action received"); + break; + } + + dictionary_acquired_item_release(health_globals.prototypes.dict, item); + return code; +} + +int dyncfg_health_cb(const char *transaction __maybe_unused, const char *id, DYNCFG_CMDS cmd, const char *add_name, + BUFFER *payload, usec_t *stop_monotonic_ut __maybe_unused, bool *cancelled __maybe_unused, + BUFFER *result, HTTP_ACCESS access __maybe_unused, const char *source, void *data __maybe_unused) { + + char buf[strlen(id) + 1]; + memcpy(buf, id, sizeof(buf)); + + char 
*words[100] = { NULL }; + size_t num_words = quoted_strings_splitter_dyncfg_id(buf, words, 100); + size_t i = 0; + int code = HTTP_RESP_INTERNAL_SERVER_ERROR; + + char *health_prefix = get_word(words, num_words, i++); + if(!health_prefix || !*health_prefix || strcmp(health_prefix, "health") != 0) + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "first component of id is not 'health'"); + + char *alert_prefix = get_word(words, num_words, i++); + if(!alert_prefix || !*alert_prefix || strcmp(alert_prefix, "alert") != 0) + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "second component of id is not 'alert'"); + + char *type_prefix = get_word(words, num_words, i++); + if(!type_prefix || !*type_prefix || strcmp(type_prefix, "prototype") != 0) + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "third component of id is not 'prototype'"); + + char *alert_name = get_word(words, num_words, i++); + if(!alert_name || !*alert_name) { + // action on the prototype template + + code = dyncfg_health_prototype_template_action(result, cmd, add_name, payload, source); + } + else { + // action on a specific alert prototype + + code = dyncfg_health_prototype_job_action(result, cmd, payload, source, alert_name); + } + return code; +} + +void health_dyncfg_unregister_all_prototypes(void) { + char key[HEALTH_CONF_MAX_LINE]; + RRD_ALERT_PROTOTYPE *ap; + + // remove dyncfg + // it is ok if they are not added before + + dfe_start_read(health_globals.prototypes.dict, ap) { + snprintfz(key, sizeof(key), DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX ":%s", string2str(ap->config.name)); + dyncfg_del(localhost, key); + } + dfe_done(ap); + dyncfg_del(localhost, DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX); +} + +static void health_dyncfg_register_prototype(RRD_ALERT_PROTOTYPE *ap) { + char key[HEALTH_CONF_MAX_LINE]; + +// bool trace = false; +// if(string_strcmp(ap->config.name, "ram_available") == 0) +// trace = true; + + snprintfz(key, sizeof(key), 
DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX ":%s", string2str(ap->config.name)); + dyncfg_add(localhost, key, "/health/alerts/prototypes", + ap->_internal.enabled ? DYNCFG_STATUS_ACCEPTED : DYNCFG_STATUS_DISABLED, DYNCFG_TYPE_JOB, + ap->config.source_type, string2str(ap->config.source), + DYNCFG_CMD_SCHEMA | DYNCFG_CMD_GET | DYNCFG_CMD_ENABLE | DYNCFG_CMD_DISABLE | + DYNCFG_CMD_UPDATE | DYNCFG_CMD_USERCONFIG | + (ap->config.source_type == DYNCFG_SOURCE_TYPE_DYNCFG && !ap->_internal.is_on_disk ? DYNCFG_CMD_REMOVE : 0), + HTTP_ACCESS_NONE, + HTTP_ACCESS_NONE, + dyncfg_health_cb, NULL); + +#ifdef NETDATA_TEST_HEALTH_PROTOTYPES_JSON_AND_PARSING + { + // make sure we can generate valid json, parse it back and come up to the same object + + CLEAN_BUFFER *original = buffer_create(0, NULL); + CLEAN_BUFFER *parsed = buffer_create(0, NULL); + CLEAN_BUFFER *error = buffer_create(0, NULL); + health_prototype_to_json(original, ap, true); + RRD_ALERT_PROTOTYPE *t = health_prototype_payload_parse(buffer_tostring(original), buffer_strlen(original), error, string2str(ap->config.name)); + if(!t) + fatal("hey! cannot parse: %s", buffer_tostring(error)); + + health_prototype_to_json(parsed, t, true); + + if(strcmp(buffer_tostring(original), buffer_tostring(parsed)) != 0) + fatal("hey! 
they are different!"); + } +#endif +} + +void health_dyncfg_register_all_prototypes(void) { + RRD_ALERT_PROTOTYPE *ap; + + dyncfg_add(localhost, + DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX, "/health/alerts/prototypes", + DYNCFG_STATUS_ACCEPTED, DYNCFG_TYPE_TEMPLATE, + DYNCFG_SOURCE_TYPE_INTERNAL, "internal", + DYNCFG_CMD_SCHEMA | DYNCFG_CMD_ADD | DYNCFG_CMD_ENABLE | DYNCFG_CMD_DISABLE | DYNCFG_CMD_USERCONFIG, + HTTP_ACCESS_NONE, + HTTP_ACCESS_NONE, + dyncfg_health_cb, NULL); + + dfe_start_read(health_globals.prototypes.dict, ap) { + if(ap->config.source_type != DYNCFG_SOURCE_TYPE_DYNCFG) + health_dyncfg_register_prototype(ap); + } + dfe_done(ap); +} diff --git a/src/health/health_event_loop.c b/src/health/health_event_loop.c new file mode 100644 index 000000000..756ffa165 --- /dev/null +++ b/src/health/health_event_loop.c @@ -0,0 +1,771 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health.h" +#include "health_internals.h" + +#define WORKER_HEALTH_JOB_RRD_LOCK 0 +#define WORKER_HEALTH_JOB_HOST_LOCK 1 +#define WORKER_HEALTH_JOB_DB_QUERY 2 +#define WORKER_HEALTH_JOB_CALC_EVAL 3 +#define WORKER_HEALTH_JOB_WARNING_EVAL 4 +#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 +#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 +#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 +#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8 +#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10 +#endif + +// ---------------------------------------------------------------------------- +// health main thread and friends + +static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) { + if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED; + if(n) return RRDCALC_STATUS_RAISED; + return RRDCALC_STATUS_CLEAR; +} + +static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) { + if(unlikely(!rc->rrdset)) { + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. 
It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + return 0; + } + + if(unlikely(rc->next_update > now)) { + if (unlikely(*next_run > rc->next_update)) { + // update the next_run time of the main loop + // to run this alarm precisely the time required + *next_run = rc->next_update; + } + + netdata_log_debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); + return 0; + } + + if(unlikely(!rc->config.update_every)) { + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + return 0; + } + + if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) { + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + return 0; + } + + if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + return 0; + } + + int update_every = rc->rrdset->update_every; + time_t first = rrdset_first_entry_s(rc->rrdset); + time_t last = rrdset_last_entry_s(rc->rrdset); + + if(unlikely(now + update_every < first /* || now - update_every > last */)) { + netdata_log_debug(D_HEALTH + , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)." + , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first + , (unsigned long) last); + return 0; + } + + if(RRDCALC_HAS_DB_LOOKUP(rc)) { + time_t needed = now + rc->config.before + rc->config.after; + + if(needed + update_every < first || needed - update_every > last) { + netdata_log_debug(D_HEALTH + , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)." 
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first + , (unsigned long) last); + return 0; + } + } + + return 1; +} + +static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) { + time_t now = now_realtime_sec(); + if(now < next_run) { + worker_is_idle(); + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); + while (now < next_run && service_running(SERVICE_HEALTH)) { + sleep_usec(USEC_PER_SEC); + now = now_realtime_sec(); + } + } + else { + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); + } +} + +static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) { +#ifdef ENABLE_ACLK + if (netdata_cloud_enabled) { + struct aclk_sync_cfg_t *wc = host->aclk_config; + if (unlikely(!wc)) { + return; + } + + if (wc->alert_queue_removed >= 1) { + wc->alert_queue_removed+=6; + } + } +#endif +} + +static void health_execute_delayed_initializations(RRDHOST *host) { + health_plugin_init(); + + RRDSET *st; + bool must_postpone = false; + + if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return; + rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); + + rrdset_foreach_reentrant(st, host) { + if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue; + rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION); + + worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET); + health_prototype_alerts_for_rrdset_incrementally(st); + must_postpone = true; + } + rrdset_foreach_done(st); + if (must_postpone) + sql_health_postpone_queue_removed(host); +} + +static void health_initialize_rrdhost(RRDHOST *host) { + health_plugin_init(); + + if(!host->health.health_enabled || + rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) || + !service_running(SERVICE_HEALTH)) + return; + + rrdhost_flag_set(host, 
RRDHOST_FLAG_INITIALIZED_HEALTH); + + host->health_log.max = health_globals.config.health_log_entries_max; + host->health_log.health_log_history = health_globals.config.health_log_history; + host->health.health_default_exec = string_dup(health_globals.config.default_exec); + host->health.health_default_recipient = string_dup(health_globals.config.default_recipient); + host->health.use_summary_for_notifications = health_globals.config.use_summary_for_notifications; + + host->health_log.next_log_id = (uint32_t)now_realtime_sec(); + host->health_log.next_alarm_id = 0; + + rw_spinlock_init(&host->health_log.spinlock); + sql_health_alarm_log_load(host); + health_apply_prototypes_to_host(host); +} + +static inline int check_if_resumed_from_suspension(void) { + static usec_t last_realtime = 0, last_monotonic = 0; + usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec(); + int ret = 0; + + // detect if monotonic and realtime have twice the difference + // in which case we assume the system was just waken from hibernation + + if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic)) + ret = 1; + + last_realtime = realtime; + last_monotonic = monotonic; + + return ret; +} + +static void health_event_loop(void) { + bool health_running_logged = false; + + unsigned int loop = 0; + + while(service_running(SERVICE_HEALTH)) { + loop++; + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u started", loop); + + time_t now = now_realtime_sec(); + int runnable = 0, apply_hibernation_delay = 0; + time_t next_run = now + health_globals.config.run_at_least_every_seconds; + RRDCALC *rc; + RRDHOST *host; + + if (unlikely(check_if_resumed_from_suspension())) { + apply_hibernation_delay = 1; + + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "Postponing alarm checks for %"PRId32" seconds, " + "because it seems that the system was just resumed from suspension.", + 
(int32_t)health_globals.config.postpone_alarms_during_hibernation_for_seconds); + schedule_node_info_update(localhost); + } + + if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { + static int logged=0; + if (!logged) { + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Skipping health checks, because all alarms are disabled via API command."); + logged = 1; + } + } + + worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); + dfe_start_reentrant(rrdhost_root_index, host) { + + if(unlikely(!service_running(SERVICE_HEALTH))) + break; + + if (unlikely(!host->health.health_enabled)) + continue; + + if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))) + health_initialize_rrdhost(host); + + health_execute_delayed_initializations(host); + + if (unlikely(apply_hibernation_delay)) { + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Postponing health checks for %"PRId32" seconds.", + rrdhost_hostname(host), + health_globals.config.postpone_alarms_during_hibernation_for_seconds); + + host->health.health_delay_up_to = + now + health_globals.config.postpone_alarms_during_hibernation_for_seconds; + } + + if (unlikely(host->health.health_delay_up_to)) { + if (unlikely(now < host->health.health_delay_up_to)) { + continue; + } + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Resuming health checks after delay.", + rrdhost_hostname(host)); + + host->health.health_delay_up_to = 0; + } + + // wait until cleanup of obsolete charts on children is complete + if (host != localhost) { + if (unlikely(host->trigger_chart_obsoletion_check == 1)) { + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Waiting for chart obsoletion check.", + rrdhost_hostname(host)); + + continue; + } + } + + if (!health_running_logged) { + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health is running.", + rrdhost_hostname(host)); + + health_running_logged = true; + } + + worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); + + // the first loop is to lookup values from the db + foreach_rrdcalc_in_rrdhost_read(host, 
rc) { + + if(unlikely(!service_running(SERVICE_HEALTH))) + break; + + rrdcalc_update_info_using_rrdset_labels(rc); + + if (health_silencers_update_disabled_silenced(host, rc)) + continue; + + // create an alert removed event if the chart is obsolete and + // has stopped being collected for 60 seconds + if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED && + rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && + now > (rc->rrdset->last_collected_time.tv_sec + 60))) { + + if (!rrdcalc_isrepeating(rc)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + time_t now_tmp = now_realtime_sec(); + + ALARM_ENTRY *ae = + health_create_alarm_entry( + host, + rc, + now_tmp, + now_tmp - rc->last_status_change, + rc->value, + NAN, + rc->status, + RRDCALC_STATUS_REMOVED, + 0, + rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0); + + if (ae) { + health_log_alert(host, ae); + health_alarm_log_add_entry(host, ae); + rc->old_status = rc->status; + rc->status = RRDCALC_STATUS_REMOVED; + rc->last_status_change = now_tmp; + rc->last_status_change_value = rc->value; + rc->last_updated = now_tmp; + rc->value = NAN; + +#ifdef ENABLE_ACLK + if (netdata_cloud_enabled) + sql_queue_alarm_to_aclk(host, ae, true); +#endif + } + } + } + + if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { + if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE)) + rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE; + continue; + } + + runnable++; + rc->old_value = rc->value; + rc->run_flags |= RRDCALC_FLAG_RUNNABLE; + + // ------------------------------------------------------------ + // if there is database lookup, do it + + if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); + + /* time_t old_db_timestamp = rc->db_before; */ + int value_is_null = 0; + + char group_options_buf[100]; + const char *group_options = group_options_buf; + switch(rc->config.time_group) { + default: + group_options = NULL; + break; + + case RRDR_GROUPING_PERCENTILE: + case 
RRDR_GROUPING_TRIMMED_MEAN: + case RRDR_GROUPING_TRIMMED_MEDIAN: + snprintfz(group_options_buf, sizeof(group_options_buf), + NETDATA_DOUBLE_FORMAT_AUTO, + rc->config.time_group_value); + break; + + case RRDR_GROUPING_COUNTIF: + snprintfz(group_options_buf, sizeof(group_options_buf), + "%s" NETDATA_DOUBLE_FORMAT_AUTO, + alerts_group_conditions_id2txt(rc->config.time_group_condition), + rc->config.time_group_value); + break; + } + + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1, + rc->config.after, rc->config.before, rc->config.time_group, group_options, + 0, rc->config.options | RRDR_OPTION_SELECTED_TIER, + &rc->db_after,&rc->db_before, + NULL, NULL, NULL, + &value_is_null, NULL, 0, 0, + QUERY_SOURCE_HEALTH, STORAGE_PRIORITY_SYNCHRONOUS); + + if (unlikely(ret != 200)) { + // database lookup failed + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_DB_ERROR; + + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret + ); + } else + rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR; + + if (unlikely(value_is_null)) { + // collected value is null + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_DB_NAN; + + netdata_log_debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc) + ); + } else + rc->run_flags &= ~RRDCALC_FLAG_DB_NAN; + + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value + ); + } + + // ------------------------------------------------------------ + // if there is calculation expression, run it + + if (unlikely(rc->config.calculation)) { + worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); + + if 
(unlikely(!expression_evaluate(rc->config.calculation))) { + // calculation failed + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_CALC_ERROR; + + netdata_log_debug( + D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + expression_parsed_as(rc->config.calculation), expression_error_msg(rc->config.calculation) + ); + } + else { + rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR; + + netdata_log_debug( + D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + NETDATA_DOUBLE_FORMAT": %s (source: %s)", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + expression_parsed_as(rc->config.calculation), + expression_result(rc->config.calculation), + expression_error_msg(rc->config.calculation), + rrdcalc_source(rc) + ); + + rc->value = expression_result(rc->config.calculation); + } + } + } + foreach_rrdcalc_in_rrdhost_done(rc); + + struct health_raised_summary *hrm = alerts_raised_summary_create(host); + + if (unlikely(runnable && service_running(SERVICE_HEALTH))) { + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(unlikely(!service_running(SERVICE_HEALTH))) + break; + + if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE))) + continue; + + if (rc->run_flags & RRDCALC_FLAG_DISABLED) { + continue; + } + RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; + RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; + + // -------------------------------------------------------- + // check the warning expression + + if (likely(rc->config.warning)) { + worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); + + if (unlikely(!expression_evaluate(rc->config.warning))) { + // calculation failed + rc->run_flags |= RRDCALC_FLAG_WARN_ERROR; + + netdata_log_debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + 
expression_error_msg(rc->config.warning) + ); + } else { + rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR; + netdata_log_debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': warning expression gave value " + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", + rrdhost_hostname(host), + rrdcalc_chart_name(rc), + rrdcalc_name(rc), + expression_result(rc->config.warning), + expression_error_msg(rc->config.warning), + rrdcalc_source(rc) + ); + warning_status = rrdcalc_value2status(expression_result(rc->config.warning)); + } + } + + // -------------------------------------------------------- + // check the critical expression + + if (likely(rc->config.critical)) { + worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); + + if (unlikely(!expression_evaluate(rc->config.critical))) { + // calculation failed + rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR; + + netdata_log_debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + expression_error_msg(rc->config.critical) + ); + } else { + rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + netdata_log_debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': critical expression gave value " + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + expression_result(rc->config.critical), + expression_error_msg(rc->config.critical), + rrdcalc_source(rc) + ); + critical_status = rrdcalc_value2status(expression_result(rc->config.critical)); + } + } + + // -------------------------------------------------------- + // decide the final alarm status + + RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; + + switch (warning_status) { + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; + + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; + + default: + break; + } + + switch (critical_status) { + case RRDCALC_STATUS_CLEAR: + if (status == 
RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; + + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; + + default: + break; + } + + // -------------------------------------------------------- + // check if the new status and the old differ + + if (status != rc->status) { + + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + int delay; + + // apply trigger hysteresis + + if (now > rc->delay_up_to_timestamp) { + rc->delay_up_current = rc->config.delay_up_duration; + rc->delay_down_current = rc->config.delay_down_duration; + rc->delay_last = 0; + rc->delay_up_to_timestamp = 0; + } else { + rc->delay_up_current = (int)((float)rc->delay_up_current * rc->config.delay_multiplier); + if (rc->delay_up_current > rc->config.delay_max_duration) + rc->delay_up_current = rc->config.delay_max_duration; + + rc->delay_down_current = (int)((float)rc->delay_down_current * rc->config.delay_multiplier); + if (rc->delay_down_current > rc->config.delay_max_duration) + rc->delay_down_current = rc->config.delay_max_duration; + } + + if (status > rc->status) + delay = rc->delay_up_current; + else + delay = rc->delay_down_current; + + // COMMENTED: because we do need to send raising alarms + // if (now + delay < rc->delay_up_to_timestamp) + // delay = (int)(rc->delay_up_to_timestamp - now); + + rc->delay_last = delay; + rc->delay_up_to_timestamp = now + delay; + + ALARM_ENTRY *ae = + health_create_alarm_entry( + host, + rc, + now, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + status, + rc->delay_last, + ( + ((rc->config.alert_action_options & ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->run_flags & RRDCALC_FLAG_SILENCED)? 
HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) + ) + ); + + health_log_alert(host, ae); + health_alarm_log_add_entry(host, ae); + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Alert event for [%s.%s], value [%s], status [%s].", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae), + rrdcalc_status2string(ae->new_status)); + + rc->last_status_change_value = rc->value; + rc->last_status_change = now; + rc->old_status = rc->status; + rc->status = status; + + if(unlikely(rrdcalc_isrepeating(rc))) { + rc->last_repeat = now; + if (rc->status == RRDCALC_STATUS_CLEAR) + rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; + } + } + + rc->last_updated = now; + rc->next_update = now + rc->config.update_every; + + if (next_run > rc->next_update) + next_run = rc->next_update; + } + foreach_rrdcalc_in_rrdhost_done(rc); + + alerts_raised_summary_populate(hrm); + + // process repeating alarms + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(unlikely(!service_running(SERVICE_HEALTH))) + break; + + int repeat_every = 0; + if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) { + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { + rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE; + repeat_every = (int)rc->config.warn_repeat_every; + } + else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { + rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE; + repeat_every = (int)rc->config.crit_repeat_every; + } + else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { + if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && + (rc->old_status == RRDCALC_STATUS_CRITICAL || rc->old_status == RRDCALC_STATUS_WARNING)) + repeat_every = 1; + } + } + else + continue; + + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + rc->last_repeat = now; + if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; + ALARM_ENTRY *ae = + health_create_alarm_entry( + host, + 
rc, + now, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->old_status, + rc->status, + rc->delay_last, + ( + ((rc->config.alert_action_options & ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) + ) + ); + + health_log_alert(host, ae); + ae->last_repeat = rc->last_repeat; + if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { + ae->flags |= HEALTH_ENTRY_RUN_ONCE; + } + rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; + health_send_notification(host, ae, hrm); + netdata_log_debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_wait_for_execution(ae); + health_alarm_log_free_one_nochecks_nounlink(ae); + } + } + foreach_rrdcalc_in_rrdhost_done(rc); + } + + if (unlikely(!service_running(SERVICE_HEALTH))) + break; + + // execute notifications + // and cleanup + + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); + health_alarm_log_process_to_send_notifications(host, hrm); + alerts_raised_summary_free(hrm); + + if (unlikely(!service_running(SERVICE_HEALTH))) { + // wait for all notifications to finish before allowing health to be cleaned up + wait_for_all_notifications_to_finish_before_allowing_health_to_be_cleaned_up(); + break; + } +#ifdef ENABLE_ACLK + if (netdata_cloud_enabled) { + struct aclk_sync_cfg_t *wc = host->aclk_config; + if (unlikely(!wc)) + continue; + + if (wc->alert_queue_removed == 1) { + sql_queue_removed_alerts_to_aclk(host); + } else if (wc->alert_queue_removed > 1) { + wc->alert_queue_removed--; + } + + if (wc->alert_checkpoint_req == 1) { + aclk_push_alarm_checkpoint(host); + } else if (wc->alert_checkpoint_req > 1) { + wc->alert_checkpoint_req--; + } + } +#endif + } + dfe_done(host); + + // wait for all notifications to finish before allowing health to be cleaned up + 
wait_for_all_notifications_to_finish_before_allowing_health_to_be_cleaned_up(); + + if(unlikely(!service_running(SERVICE_HEALTH))) + break; + + health_sleep(next_run, loop); + + } // forever +} + + +static void health_main_cleanup(void *pptr) { + struct netdata_static_thread *static_thread = CLEANUP_FUNCTION_GET_PTR(pptr); + if(!static_thread) return; + + worker_unregister(); + static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; + netdata_log_info("cleaning up..."); + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Health thread ended."); +} + +void *health_main(void *ptr) { + worker_register("HEALTH"); + worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock"); + worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock"); + worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup"); + worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval"); + worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval"); + worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval"); + worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry"); + worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process"); + worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init"); + worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init"); + + CLEANUP_FUNCTION_REGISTER(health_main_cleanup) cleanup_ptr = ptr; + health_event_loop(); + return NULL; +} diff --git a/src/health/health_internals.h b/src/health/health_internals.h new file mode 100644 index 000000000..638a96195 --- /dev/null +++ b/src/health/health_internals.h @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_HEALTH_INTERNALS_H +#define NETDATA_HEALTH_INTERNALS_H + +#include "health.h" + +#define HEALTH_LOG_ENTRIES_DEFAULT 1000U +#define HEALTH_LOG_ENTRIES_MAX 100000U +#define HEALTH_LOG_ENTRIES_MIN 10U + +#define 
HEALTH_LOG_HISTORY_DEFAULT (5 * 86400) + +#define HEALTH_CONF_MAX_LINE 4096 + +#define HEALTH_ALARM_KEY "alarm" +#define HEALTH_TEMPLATE_KEY "template" +#define HEALTH_CHART_KEY "chart" +#define HEALTH_CONTEXT_KEY "context" +#define HEALTH_ON_KEY "on" +#define HEALTH_HOST_KEY "hosts" +#define HEALTH_OS_KEY "os" +#define HEALTH_PLUGIN_KEY "plugin" +#define HEALTH_MODULE_KEY "module" +#define HEALTH_LOOKUP_KEY "lookup" +#define HEALTH_CALC_KEY "calc" +#define HEALTH_EVERY_KEY "every" +#define HEALTH_GREEN_KEY "green" +#define HEALTH_RED_KEY "red" +#define HEALTH_WARN_KEY "warn" +#define HEALTH_CRIT_KEY "crit" +#define HEALTH_EXEC_KEY "exec" +#define HEALTH_RECIPIENT_KEY "to" +#define HEALTH_UNITS_KEY "units" +#define HEALTH_SUMMARY_KEY "summary" +#define HEALTH_INFO_KEY "info" +#define HEALTH_CLASS_KEY "class" +#define HEALTH_COMPONENT_KEY "component" +#define HEALTH_TYPE_KEY "type" +#define HEALTH_DELAY_KEY "delay" +#define HEALTH_OPTIONS_KEY "options" +#define HEALTH_REPEAT_KEY "repeat" +#define HEALTH_HOST_LABEL_KEY "host labels" +#define HEALTH_CHART_LABEL_KEY "chart labels" + +void alert_action_options_to_buffer_json_array(BUFFER *wb, const char *key, ALERT_ACTION_OPTIONS options); +void alert_action_options_to_buffer(BUFFER *wb, ALERT_ACTION_OPTIONS options); +ALERT_ACTION_OPTIONS alert_action_options_parse(char *o); +ALERT_ACTION_OPTIONS alert_action_options_parse_one(const char *o); + +typedef struct rrd_alert_prototype { + struct rrd_alert_match match; + struct rrd_alert_config config; + + struct { + uint32_t uses; + bool enabled; + bool is_on_disk; + SPINLOCK spinlock; + struct rrd_alert_prototype *prev, *next; + } _internal; +} RRD_ALERT_PROTOTYPE; +bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap, char **msg); +void health_prototype_cleanup(RRD_ALERT_PROTOTYPE *ap); +void health_prototype_free(RRD_ALERT_PROTOTYPE *ap); + +struct health_plugin_globals { + struct { + SPINLOCK spinlock; + bool done; + } initialization; + + struct { + bool enabled; + bool 
stock_enabled; + bool use_summary_for_notifications; + + unsigned int health_log_entries_max; + uint32_t health_log_history; // the health log history in seconds to be kept in db + + STRING *silencers_filename; + STRING *default_exec; + STRING *default_recipient; + + SIMPLE_PATTERN *enabled_alerts; + + uint32_t default_warn_repeat_every; // the default value for the interval between repeating warning notifications + uint32_t default_crit_repeat_every; // the default value for the interval between repeating critical notifications + + int32_t run_at_least_every_seconds; + int32_t postpone_alarms_during_hibernation_for_seconds; + } config; + + struct { + DICTIONARY *dict; + } prototypes; +}; + +extern struct health_plugin_globals health_globals; + +int health_readfile(const char *filename, void *data, bool stock_config); +void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae); +void wait_for_all_notifications_to_finish_before_allowing_health_to_be_cleaned_up(void); + +void health_alarm_wait_for_execution(ALARM_ENTRY *ae); + +bool rrdcalc_add_from_prototype(RRDHOST *host, RRDSET *st, RRD_ALERT_PROTOTYPE *ap); + +int dyncfg_health_cb(const char *transaction, const char *id, DYNCFG_CMDS cmd, const char *add_name, + BUFFER *payload, usec_t *stop_monotonic_ut, bool *cancelled, + BUFFER *result, HTTP_ACCESS access, const char *source, void *data); + +void health_dyncfg_unregister_all_prototypes(void); +void health_dyncfg_register_all_prototypes(void); +void health_prototype_to_json(BUFFER *wb, RRD_ALERT_PROTOTYPE *ap, bool for_hashing); + +bool alert_variable_lookup(STRING *variable, void *data, NETDATA_DOUBLE *result); + +struct health_raised_summary; +struct health_raised_summary *alerts_raised_summary_create(RRDHOST *host); +void alerts_raised_summary_populate(struct health_raised_summary *hrm); +void alerts_raised_summary_free(struct health_raised_summary *hrm); +void health_send_notification(RRDHOST *host, ALARM_ENTRY *ae, struct health_raised_summary *hrm); +void 
health_alarm_log_process_to_send_notifications(RRDHOST *host, struct health_raised_summary *hrm); + +void health_apply_prototype_to_host(RRDHOST *host, RRD_ALERT_PROTOTYPE *ap); +void health_prototype_apply_to_all_hosts(RRD_ALERT_PROTOTYPE *ap); + +#endif //NETDATA_HEALTH_INTERNALS_H diff --git a/src/health/health_json.c b/src/health/health_json.c new file mode 100644 index 000000000..68bfb5229 --- /dev/null +++ b/src/health/health_json.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health.h" + +void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) { + if(value && *value) { + buffer_sprintf(wb, "%s\"%s\":\"", prefix, label); + buffer_strcat_htmlescape(wb, value); + buffer_strcat(wb, "\""); + buffer_strcat(wb, suffix); + } + else + buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); +} + +static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { + (void)host; + buffer_sprintf(wb, + "\t\t\"%s.%s\": {\n" + "\t\t\t\"id\": %lu,\n" + , rrdcalc_chart_name(rc), rrdcalc_name(rc) + , (unsigned long)rc->id); + + buffer_strcat(wb, "\t\t\t\"value\":"); + buffer_print_netdata_double(wb, rc->value); + buffer_strcat(wb, ",\n"); + + buffer_strcat(wb, "\t\t\t\"last_updated\":"); + buffer_sprintf(wb, "%lu", (unsigned long)rc->last_updated); + buffer_strcat(wb, ",\n"); + + buffer_sprintf(wb, + "\t\t\t\"status\": \"%s\"\n" + , rrdcalc_status2string(rc->status)); + + buffer_strcat(wb, "\t\t}"); +} + +static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { + char value_string[100 + 1]; + format_value_and_unit(value_string, 100, rc->value, rrdcalc_units(rc), -1); + + char hash_id[GUID_LEN + 1]; + uuid_unparse_lower(rc->config.hash_id, hash_id); + + buffer_sprintf(wb, + "\t\t\"%s.%s\": {\n" + "\t\t\t\"id\": %lu,\n" + "\t\t\t\"config_hash_id\": \"%s\",\n" + "\t\t\t\"name\": \"%s\",\n" + "\t\t\t\"chart\": \"%s\",\n" + 
"\t\t\t\"class\": \"%s\",\n" + "\t\t\t\"component\": \"%s\",\n" + "\t\t\t\"type\": \"%s\",\n" + "\t\t\t\"active\": %s,\n" + "\t\t\t\"disabled\": %s,\n" + "\t\t\t\"silenced\": %s,\n" + "\t\t\t\"exec\": \"%s\",\n" + "\t\t\t\"recipient\": \"%s\",\n" + "\t\t\t\"source\": \"%s\",\n" + "\t\t\t\"units\": \"%s\",\n" + "\t\t\t\"summary\": \"%s\",\n" + "\t\t\t\"info\": \"%s\",\n" + "\t\t\t\"status\": \"%s\",\n" + "\t\t\t\"last_status_change\": %lu,\n" + "\t\t\t\"last_updated\": %lu,\n" + "\t\t\t\"next_update\": %lu,\n" + "\t\t\t\"update_every\": %d,\n" + "\t\t\t\"delay_up_duration\": %d,\n" + "\t\t\t\"delay_down_duration\": %d,\n" + "\t\t\t\"delay_max_duration\": %d,\n" + "\t\t\t\"delay_multiplier\": %f,\n" + "\t\t\t\"delay\": %d,\n" + "\t\t\t\"delay_up_to_timestamp\": %lu,\n" + "\t\t\t\"warn_repeat_every\": \"%u\",\n" + "\t\t\t\"crit_repeat_every\": \"%u\",\n" + "\t\t\t\"value_string\": \"%s\",\n" + "\t\t\t\"last_repeat\": \"%lu\",\n" + "\t\t\t\"times_repeat\": %lu,\n" + , rrdcalc_chart_name(rc), rrdcalc_name(rc) + , (unsigned long)rc->id + , hash_id + , rrdcalc_name(rc) + , rrdcalc_chart_name(rc) + , rc->config.classification?rrdcalc_classification(rc):"Unknown" + , rc->config.component?rrdcalc_component(rc):"Unknown" + , rc->config.type?rrdcalc_type(rc):"Unknown" + , (rc->rrdset)?"true":"false" + , (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false" + , (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false" + , rc->config.exec?rrdcalc_exec(rc):string2str(host->health.health_default_exec) + , rc->config.recipient?rrdcalc_recipient(rc):string2str(host->health.health_default_recipient) + , rrdcalc_source(rc) + , rrdcalc_units(rc) + , string2str(rc->summary) + , string2str(rc->info) + , rrdcalc_status2string(rc->status) + , (unsigned long)rc->last_status_change + , (unsigned long)rc->last_updated + , (unsigned long)rc->next_update + , rc->config.update_every + , rc->config.delay_up_duration + , rc->config.delay_down_duration + , rc->config.delay_max_duration + , 
rc->config.delay_multiplier + , rc->delay_last + , (unsigned long)rc->delay_up_to_timestamp + , rc->config.warn_repeat_every + , rc->config.crit_repeat_every + , value_string + , (unsigned long)rc->last_repeat + , (unsigned long)rc->times_repeat + ); + + if(unlikely(rc->config.alert_action_options & ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION)) { + buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n"); + } + + if(RRDCALC_HAS_DB_LOOKUP(rc)) { + if(rc->config.dimensions) + health_string2json(wb, "\t\t\t", "lookup_dimensions", rrdcalc_dimensions(rc), ",\n"); + + buffer_sprintf(wb, + "\t\t\t\"db_after\": %lu,\n" + "\t\t\t\"db_before\": %lu,\n" + "\t\t\t\"lookup_method\": \"%s\",\n" + "\t\t\t\"lookup_after\": %d,\n" + "\t\t\t\"lookup_before\": %d,\n" + "\t\t\t\"lookup_options\": \"", + (unsigned long) rc->db_after, + (unsigned long) rc->db_before, + time_grouping_id2txt(rc->config.time_group), + rc->config.after, + rc->config.before + ); + rrdr_options_to_buffer(wb, rc->config.options); + buffer_strcat(wb, "\",\n"); + } + + if(rc->config.calculation) { + health_string2json(wb, "\t\t\t", "calc", expression_source(rc->config.calculation), ",\n"); + health_string2json(wb, "\t\t\t", "calc_parsed", expression_parsed_as(rc->config.calculation), ",\n"); + } + + if(rc->config.warning) { + health_string2json(wb, "\t\t\t", "warn", expression_source(rc->config.warning), ",\n"); + health_string2json(wb, "\t\t\t", "warn_parsed", expression_parsed_as(rc->config.warning), ",\n"); + } + + if(rc->config.critical) { + health_string2json(wb, "\t\t\t", "crit", expression_source(rc->config.critical), ",\n"); + health_string2json(wb, "\t\t\t", "crit_parsed", expression_parsed_as(rc->config.critical), ",\n"); + } + + buffer_strcat(wb, "\t\t\t\"green\":"); + buffer_print_netdata_double(wb, NAN); + buffer_strcat(wb, ",\n"); + + buffer_strcat(wb, "\t\t\t\"red\":"); + buffer_print_netdata_double(wb, NAN); + buffer_strcat(wb, ",\n"); + + buffer_strcat(wb, "\t\t\t\"value\":"); + 
buffer_print_netdata_double(wb, rc->value); + buffer_strcat(wb, "\n"); + + buffer_strcat(wb, "\t\t}"); +} + +void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCALC_STATUS status) { + RRDCALC *rc; + int numberOfAlarms = 0; + char *tok = NULL; + char *p = NULL; + + if (contexts) { + p = (char*)buffer_tostring(contexts); + while(p && *p && (tok = strsep_skip_consecutive_separators(&p, ", |"))) { + if(!*tok) continue; + + STRING *tok_string = string_strdupz(tok); + + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) + continue; + if(unlikely(rc->rrdset + && rc->rrdset->context == tok_string + && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))) + numberOfAlarms++; + } + foreach_rrdcalc_in_rrdhost_done(rc); + + string_freez(tok_string); + } + } + else { + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) + continue; + if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)) + numberOfAlarms++; + } + foreach_rrdcalc_in_rrdhost_done(rc); + } + + buffer_sprintf(wb, "%d", numberOfAlarms); +} + +static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, void (*fp)(RRDHOST *, BUFFER *, RRDCALC *)) { + RRDCALC *rc; + int i = 0; + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) + continue; + + if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))) + continue; + + if(likely(i)) buffer_strcat(wb, ",\n"); + fp(host, wb, rc); + i++; + } + 
foreach_rrdcalc_in_rrdhost_done(rc); +} + +void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) { + buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\"," + "\n\t\"latest_alarm_log_unique_id\": %u," + "\n\t\"status\": %s," + "\n\t\"now\": %lu," + "\n\t\"alarms\": {\n", + rrdhost_hostname(host), + (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0, + host->health.health_enabled?"true":"false", + (unsigned long)now_realtime_sec()); + + health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc2json_nolock); + + buffer_strcat(wb, "\n\t}\n}\n"); +} + +void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) { + buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\"," + "\n\t\"alarms\": {\n", + rrdhost_hostname(host)); + + health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc_values2json_nolock); + + buffer_strcat(wb, "\n\t}\n}\n"); +} + +void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) { + buffer_json_member_add_array(wb, key); + + if(flags & HEALTH_ENTRY_FLAG_PROCESSED) + buffer_json_add_array_item_string(wb, "PROCESSED"); + if(flags & HEALTH_ENTRY_FLAG_UPDATED) + buffer_json_add_array_item_string(wb, "UPDATED"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_RUN) + buffer_json_add_array_item_string(wb, "EXEC_RUN"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_FAILED) + buffer_json_add_array_item_string(wb, "EXEC_FAILED"); + if(flags & HEALTH_ENTRY_FLAG_SILENCED) + buffer_json_add_array_item_string(wb, "SILENCED"); + if(flags & HEALTH_ENTRY_RUN_ONCE) + buffer_json_add_array_item_string(wb, "RUN_ONCE"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS) + buffer_json_add_array_item_string(wb, "EXEC_IN_PROGRESS"); + if(flags & HEALTH_ENTRY_FLAG_IS_REPEATING) + buffer_json_add_array_item_string(wb, "RECURRING"); + if(flags & HEALTH_ENTRY_FLAG_SAVED) + buffer_json_add_array_item_string(wb, "SAVED"); + if(flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED) + buffer_json_add_array_item_string(wb, "ACLK_QUEUED"); + if(flags & 
HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION) + buffer_json_add_array_item_string(wb, "NO_CLEAR_NOTIFICATION"); + + buffer_json_array_close(wb); +} diff --git a/health/health_log.c b/src/health/health_log.c index fd124ce80..b04f8f248 100644 --- a/health/health_log.c +++ b/src/health/health_log.c @@ -10,6 +10,8 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) { + if(!host || !ae) return; + ND_LOG_STACK lgs[] = { ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid), ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname), @@ -86,31 +88,31 @@ void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int inline ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, - uint32_t alarm_id, - uint32_t alarm_event_id, - const uuid_t config_hash_id, + RRDCALC *rc, time_t when, - STRING *name, - STRING *chart, - STRING *chart_context, - STRING *chart_name, - STRING *class, - STRING *component, - STRING *type, - STRING *exec, - STRING *recipient, time_t duration, NETDATA_DOUBLE old_value, NETDATA_DOUBLE new_value, RRDCALC_STATUS old_status, RRDCALC_STATUS new_status, - STRING *source, - STRING *units, - STRING *summary, - STRING *info, int delay, HEALTH_ENTRY_FLAGS flags ) { + uint32_t alarm_id = rc->id; + uint32_t alarm_event_id = rc->next_event_id++; + STRING *name = rc->config.name; + STRING *chart = rc->rrdset->id; + STRING *chart_context = rc->rrdset->context; + STRING *chart_name = rc->rrdset->name; + STRING *class = rc->config.classification; + STRING *component = rc->config.component; + STRING *type = rc->config.type; + STRING *exec = rc->config.exec; + STRING *recipient = rc->config.recipient; + STRING *source = rc->config.source; + STRING *units = rc->config.units; + STRING *summary = rc->summary; + STRING *info = rc->info; if (duration < 0) duration = 0; @@ -123,7 +125,7 @@ inline ALARM_ENTRY* 
health_create_alarm_entry( ae->chart_context = string_dup(chart_context); ae->chart_name = string_dup(chart_name); - uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id)); + uuid_copy(ae->config_hash_id, rc->config.hash_id); uuid_generate_random(ae->transition_id); ae->global_id = now_realtime_usec(); diff --git a/src/health/health_notifications.c b/src/health/health_notifications.c new file mode 100644 index 000000000..79426f48c --- /dev/null +++ b/src/health/health_notifications.c @@ -0,0 +1,569 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health_internals.h" + +// the queue of executed alarm notifications that haven't been waited for yet +static struct { + ALARM_ENTRY *head; // oldest + ALARM_ENTRY *tail; // latest +} alarm_notifications_in_progress = {NULL, NULL}; + +struct health_raised_summary { + RRDHOST *host; + DICTIONARY *rrdcalc_dict; + + struct { + size_t size; + size_t used; + const DICTIONARY_ITEM **array; + } active_alerts; +}; + +void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { + if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)) + return; + + spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp); + netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); + ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; + + if(ae->exec_code != 0) + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED; + + unlink_alarm_notify_in_progress(ae); +} + +void wait_for_all_notifications_to_finish_before_allowing_health_to_be_cleaned_up(void) { + ALARM_ENTRY *ae; + while (NULL != (ae = alarm_notifications_in_progress.head)) { + if(unlikely(!service_running(SERVICE_HEALTH))) + break; + + health_alarm_wait_for_execution(ae); + } +} + +void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae) +{ + struct alarm_entry *prev = ae->prev_in_progress; + struct alarm_entry *next = ae->next_in_progress; + + if (NULL != prev) { + prev->next_in_progress = next; + } + if (NULL != next) { + 
next->prev_in_progress = prev; + } + if (ae == alarm_notifications_in_progress.head) { + alarm_notifications_in_progress.head = next; + } + if (ae == alarm_notifications_in_progress.tail) { + alarm_notifications_in_progress.tail = prev; + } +} + +static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae) +{ + ae->prev_in_progress = NULL; + ae->next_in_progress = NULL; + + if (NULL != alarm_notifications_in_progress.tail) { + ae->prev_in_progress = alarm_notifications_in_progress.tail; + alarm_notifications_in_progress.tail->next_in_progress = ae; + } + if (NULL == alarm_notifications_in_progress.head) { + alarm_notifications_in_progress.head = ae; + } + alarm_notifications_in_progress.tail = ae; + +} + +static bool prepare_command(BUFFER *wb, + const char *exec, + const char *recipient, + const char *registry_hostname, + uint32_t unique_id, + uint32_t alarm_id, + uint32_t alarm_event_id, + uint32_t when, + const char *alert_name, + const char *alert_chart_name, + const char *new_status, + const char *old_status, + NETDATA_DOUBLE new_value, + NETDATA_DOUBLE old_value, + const char *alert_source, + uint32_t duration, + uint32_t non_clear_duration, + const char *alert_units, + const char *alert_info, + const char *new_value_string, + const char *old_value_string, + const char *source, + const char *error_msg, + int n_warn, + int n_crit, + const char *warn_alarms, + const char *crit_alarms, + const char *classification, + const char *edit_command, + const char *machine_guid, + nd_uuid_t *transition_id, + const char *summary, + const char *context, + const char *component, + const char *type +) { + char buf[8192]; + size_t n = sizeof(buf) - 1; + + buffer_strcat(wb, "exec"); + + if (!sanitize_command_argument_string(buf, exec, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, recipient, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, registry_hostname, 
n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%u'", unique_id); + + buffer_sprintf(wb, " '%u'", alarm_id); + + buffer_sprintf(wb, " '%u'", alarm_event_id); + + buffer_sprintf(wb, " '%u'", when); + + if (!sanitize_command_argument_string(buf, alert_name, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_chart_name, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, new_status, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, old_status, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value); + + buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value); + + if (!sanitize_command_argument_string(buf, alert_source, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%u'", duration); + + buffer_sprintf(wb, " '%u'", non_clear_duration); + + if (!sanitize_command_argument_string(buf, alert_units, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_info, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, new_value_string, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, old_value_string, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, source, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, error_msg, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%d'", n_warn); + + buffer_sprintf(wb, " '%d'", n_crit); + + if (!sanitize_command_argument_string(buf, warn_alarms, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, 
crit_alarms, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, classification, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, edit_command, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, machine_guid, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + char tr_id[UUID_STR_LEN]; + uuid_unparse_lower(*transition_id, tr_id); + if (!sanitize_command_argument_string(buf, tr_id, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, summary, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, context, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, component, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, type, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + return true; +} + +static inline int compare_raised_alerts(const void *a, const void *b) { + const DICTIONARY_ITEM *item1 = *(const DICTIONARY_ITEM **)a; + const DICTIONARY_ITEM *item2 = *(const DICTIONARY_ITEM **)b; + + RRDCALC *rc1 = dictionary_acquired_item_value(item1); + RRDCALC *rc2 = dictionary_acquired_item_value(item2); + + return (int)(rc2->last_status_change - rc1->last_status_change); +} + +static void health_raised_summary_add_alert(struct health_raised_summary *hrm, const DICTIONARY_ITEM *item) { + if(hrm->active_alerts.used >= hrm->active_alerts.size) { + if(hrm->active_alerts.size == 0) + hrm->active_alerts.size = 2; + + hrm->active_alerts.size *= 2; + hrm->active_alerts.array = reallocz(hrm->active_alerts.array, sizeof(const DICTIONARY_ITEM *) * hrm->active_alerts.size); + } + + hrm->active_alerts.array[hrm->active_alerts.used++] = dictionary_acquired_item_dup(hrm->rrdcalc_dict, item); +} + +void 
alerts_raised_summary_free(struct health_raised_summary *hrm) { + for(size_t i = 0; i < hrm->active_alerts.used ;i++) + dictionary_acquired_item_release(hrm->rrdcalc_dict, hrm->active_alerts.array[i]); + + freez(hrm->active_alerts.array); + freez(hrm); +} + +struct health_raised_summary *alerts_raised_summary_create(RRDHOST *host) { + struct health_raised_summary *hrm = callocz(1, sizeof(*hrm)); + hrm->rrdcalc_dict = host->rrdcalc_root_index; + hrm->host = host; + return hrm; +} + +void alerts_raised_summary_populate(struct health_raised_summary *hrm) { + RRDCALC *rc; + foreach_rrdcalc_in_rrdhost_read(hrm->host, rc) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; + health_raised_summary_add_alert(hrm, rc_dfe.item); + } + foreach_rrdcalc_in_rrdhost_done(rc); + + if (hrm->active_alerts.used > 1) + qsort(hrm->active_alerts.array, hrm->active_alerts.used, sizeof(const DICTIONARY_ITEM *), compare_raised_alerts); +} + +static size_t +health_raised_summary_entries(struct health_raised_summary *hrm, BUFFER *dst, ALARM_ENTRY *ae, RRDCALC_STATUS status) { + buffer_flush(dst); + + size_t count = 0; + for(size_t i = 0; i < hrm->active_alerts.used ;i++) { + RRDCALC *rc = dictionary_acquired_item_value(hrm->active_alerts.array[i]); + if(rc->status != status) continue; + if(rc->id == ae->alarm_id) continue; + + count++; + if(buffer_strlen(dst)) buffer_putc(dst, ','); + buffer_sprintf(dst, "%s=%" PRId64, string2str(rc->config.name), (int64_t)rc->last_status_change); + } + + return count; +} + +static const char *health_raised_summary_my_expression_source(struct health_raised_summary *hrm, ALARM_ENTRY *ae) { + for(size_t i = 0; i < hrm->active_alerts.used ;i++) { + RRDCALC *rc = dictionary_acquired_item_value(hrm->active_alerts.array[i]); + if(rc->id != ae->alarm_id) continue; + + if(rc->status == RRDCALC_STATUS_CRITICAL) + return expression_source(rc->config.critical); + else + return expression_source(rc->config.warning); + } + + return ""; +} + 
+static const char *health_raised_summary_my_expression_error(struct health_raised_summary *hrm, ALARM_ENTRY *ae) { + for(size_t i = 0; i < hrm->active_alerts.used ;i++) { + RRDCALC *rc = dictionary_acquired_item_value(hrm->active_alerts.array[i]); + if(rc->id != ae->alarm_id) continue; + + if(rc->status == RRDCALC_STATUS_CRITICAL) + return expression_error_msg(rc->config.critical); + else + return expression_error_msg(rc->config.warning); + } + + return ""; +} + +void health_send_notification(RRDHOST *host, ALARM_ENTRY *ae, struct health_raised_summary *hrm) { + netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", + ae->chart?ae_chart_id(ae):"NOCHART", ae_name(ae), + ae->new_value, + rrdcalc_status2string(ae->old_status), + rrdcalc_status2string(ae->new_status) + ); + + ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED; + + if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { + // do not send notifications for internal statuses + netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + goto done; + } + + if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { + // do not send notifications for disabled statuses + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + + // mark it as run, so that we will send the same alarm if it happens again + goto done; + } + + // find the previous notification for the same alarm + // which we have run the exec script + // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set + RRDCALC_STATUS last_executed_status = -3; + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { + int ret = 
sql_health_get_last_executed_event(host, ae, &last_executed_status); + + if (likely(ret == 1)) { + // we have executed this alarm notification in the past + if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) { + // don't send the notification for the same status again + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health not sending again notification for alarm '%s.%s' status %s", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), + rrdcalc_status2string(ae->new_status)); + goto done; + } + } + else { + // we have not executed this alarm notification in the past + // so, don't send CLEAR notifications + if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) { + if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) { + netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" + , ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + goto done; + } + } + } + } + + // Check if alarm notifications are silenced + if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health not sending notification for alarm '%s.%s' status %s " + "(command API has disabled notifications)", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + goto done; + } + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Sending notification for alarm '%s.%s' status %s.", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + + const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec); + const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient); + + char *edit_command = ae->source ? 
health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); + + BUFFER *warn_alarms = buffer_create(1024, &netdata_buffers_statistics.buffers_health); + BUFFER *crit_alarms = buffer_create(1024, &netdata_buffers_statistics.buffers_health); + + size_t n_warn = health_raised_summary_entries(hrm, warn_alarms, ae, RRDCALC_STATUS_WARNING); + size_t n_crit = health_raised_summary_entries(hrm, crit_alarms, ae, RRDCALC_STATUS_CRITICAL); + + BUFFER *wb = buffer_create(8192, &netdata_buffers_statistics.buffers_health); + bool ok = prepare_command(wb, + exec, + recipient, + rrdhost_registry_hostname(host), + ae->unique_id, + ae->alarm_id, + ae->alarm_event_id, + (unsigned long)ae->when, + ae_name(ae), + ae->chart?ae_chart_id(ae):"NOCHART", + rrdcalc_status2string(ae->new_status), + rrdcalc_status2string(ae->old_status), + ae->new_value, + ae->old_value, + ae->source?ae_source(ae):"UNKNOWN", + (uint32_t)ae->duration, + (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? 
(uint32_t)ae->duration : (uint32_t)ae->non_clear_duration, + ae_units(ae), + ae_info(ae), + ae_new_value_string(ae), + ae_old_value_string(ae), + health_raised_summary_my_expression_source(hrm, ae), + health_raised_summary_my_expression_error(hrm, ae), + n_warn, + n_crit, + buffer_tostring(warn_alarms), + buffer_tostring(crit_alarms), + ae->classification?ae_classification(ae):"Unknown", + edit_command, + host->machine_guid, + &ae->transition_id, + host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae), + string2str(ae->chart_context), + string2str(ae->component), + string2str(ae->type) + ); + + const char *command_to_run = buffer_tostring(wb); + if (ok) { + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; + ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ + + netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run); + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; + ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); + enqueue_alarm_notify_in_progress(ae); + health_alarm_log_save(host, ae); + } else { + netdata_log_error("Failed to format command arguments"); + } + + buffer_free(warn_alarms); + buffer_free(crit_alarms); + buffer_free(wb); + freez(edit_command); + + return; //health_alarm_wait_for_execution +done: + health_alarm_log_save(host, ae); +} + +bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, nd_uuid_t *transitions_id) { + if(!rc->rrdset) + return false; + + RRDHOST *host = rc->rrdset->rrdhost; + + rw_spinlock_read_lock(&host->health_log.spinlock); + + ALARM_ENTRY *ae; + for(ae = host->health_log.alarms; ae ; ae = ae->next) { + if(unlikely(ae->alarm_id == rc->id)) + break; + } + + if(ae) { + *global_id = ae->global_id; + uuid_copy(*transitions_id, ae->transition_id); + } + else { + *global_id = 0; + uuid_clear(*transitions_id); + } + + rw_spinlock_read_unlock(&host->health_log.spinlock); + + return ae != NULL; +} + +void 
health_alarm_log_process_to_send_notifications(RRDHOST *host, struct health_raised_summary *hrm) { + uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0; + time_t now = now_realtime_sec(); + + rw_spinlock_read_lock(&host->health_log.spinlock); + + ALARM_ENTRY *ae; + for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { + if(unlikely( + !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && + !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) + )) { + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; + + if(likely(now >= ae->delay_up_to_timestamp)) + health_send_notification(host, ae, hrm); + } + } + + rw_spinlock_read_unlock(&host->health_log.spinlock); + + // remember this for the next iteration + host->health_last_processed_id = first_waiting; + + //delete those that are updated, no in progress execution, and is not repeating + rw_spinlock_write_lock(&host->health_log.spinlock); + + ALARM_ENTRY *prev = NULL, *next = NULL; + for(ae = host->health_log.alarms; ae ; ae = next) { + next = ae->next; // set it here, for the next iteration + + if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) && + (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)) + || + ((ae->new_status == RRDCALC_STATUS_REMOVED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + (ae->when + 86400 < now_realtime_sec()))) + { + + if(host->health_log.alarms == ae) { + host->health_log.alarms = next; + // prev is also NULL here + } + else { + prev->next = next; + // prev should not be touched here - we need it for the next iteration + // because we may have to also remove the next item + } + + health_alarm_log_free_one_nochecks_nounlink(ae); + } + else + prev = ae; + } + + rw_spinlock_write_unlock(&host->health_log.spinlock); +} diff --git a/src/health/health_prototypes.c b/src/health/health_prototypes.c new file mode 
100644 index 000000000..c43096115 --- /dev/null +++ b/src/health/health_prototypes.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health_internals.h" + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + ALERT_LOOKUP_DIMS_GROUPING group; + const char *name; +} dims_grouping[] = { + { .group = ALERT_LOOKUP_DIMS_SUM, .name = "sum" }, + { .group = ALERT_LOOKUP_DIMS_MIN, .name = "min" }, + { .group = ALERT_LOOKUP_DIMS_MAX, .name = "max" }, + { .group = ALERT_LOOKUP_DIMS_AVERAGE, .name = "average" }, + { .group = ALERT_LOOKUP_DIMS_MIN2MAX, .name = "min2max" }, + + // terminator + { .group = 0, .name = NULL }, +}; + +ALERT_LOOKUP_DIMS_GROUPING alerts_dims_grouping2id(const char *group) { + if(!group || !*group) + return dims_grouping[0].group; + + for(size_t i = 0; dims_grouping[i].name ;i++) { + if(strcmp(dims_grouping[i].name, group) == 0) + return dims_grouping[i].group; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert lookup dimensions grouping '%s' is not valid", group); + return dims_grouping[0].group; +} + +const char *alerts_dims_grouping_id2group(ALERT_LOOKUP_DIMS_GROUPING grouping) { + for(size_t i = 0; dims_grouping[i].name ;i++) { + if(grouping == dims_grouping[i].group) + return dims_grouping[i].name; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert lookup dimensions grouping %d is not valid", grouping); + return dims_grouping[0].name; +} + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + ALERT_LOOKUP_DATA_SOURCE source; + const char *name; +} data_sources[] = { + { .source = ALERT_LOOKUP_DATA_SOURCE_SAMPLES, .name = "samples" }, + { .source = ALERT_LOOKUP_DATA_SOURCE_PERCENTAGES, .name = "percentages" }, + { .source = ALERT_LOOKUP_DATA_SOURCE_ANOMALIES, .name = "anomalies" }, + + // terminator + { .source = 0, .name = NULL }, +}; + 
+ALERT_LOOKUP_DATA_SOURCE alerts_data_sources2id(const char *source) { + if(!source || !*source) + return data_sources[0].source; + + for(size_t i = 0; data_sources[i].name ;i++) { + if(strcmp(data_sources[i].name, source) == 0) + return data_sources[i].source; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source '%s' is not valid", source); + return data_sources[0].source; +} + +const char *alerts_data_source_id2source(ALERT_LOOKUP_DATA_SOURCE source) { + for(size_t i = 0; data_sources[i].name ;i++) { + if(source == data_sources[i].source) + return data_sources[i].name; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source %d is not valid", source); + return data_sources[0].name; +} + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + ALERT_LOOKUP_TIME_GROUP_CONDITION condition; + const char *name; +} group_conditions[] = { + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_EQUAL, .name = "=" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_NOT_EQUAL, .name = "!=" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER, .name = ">" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER_EQUAL, .name = ">=" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS, .name = "<" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS_EQUAL, .name = "<=" }, + + // terminator + { .condition = 0, .name = NULL }, +}; + +ALERT_LOOKUP_TIME_GROUP_CONDITION alerts_group_condition2id(const char *source) { + if(!source || !*source) + return group_conditions[0].condition; + + for(size_t i = 0; group_conditions[i].name ;i++) { + if(strcmp(group_conditions[i].name, source) == 0) + return group_conditions[i].condition; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source '%s' is not valid", source); + return group_conditions[0].condition; +} + +const char *alerts_group_conditions_id2txt(ALERT_LOOKUP_TIME_GROUP_CONDITION source) { + for(size_t i = 
0; group_conditions[i].name ;i++) { + if(source == group_conditions[i].condition) + return group_conditions[i].name; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source %d is not valid", source); + return group_conditions[0].name; +} + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + const char *name; + uint32_t hash; + ALERT_ACTION_OPTIONS value; +} alert_action_options[] = { + { "no-clear-notification", 0 , ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION} + + // terminator + , {NULL, 0, 0} +}; + +inline ALERT_ACTION_OPTIONS alert_action_options_parse_one(const char *o) { + ALERT_ACTION_OPTIONS ret = 0; + + if(!o || !*o) return ret; + + uint32_t hash = simple_hash(o); + int i; + for(i = 0; alert_action_options[i].name ; i++) { + if (unlikely(hash == alert_action_options[i].hash && !strcmp(o, alert_action_options[i].name))) { + ret |= alert_action_options[i].value; + break; + } + } + + return ret; +} + +inline ALERT_ACTION_OPTIONS alert_action_options_parse(char *o) { + ALERT_ACTION_OPTIONS ret = 0; + char *tok; + + while(o && *o && (tok = strsep_skip_consecutive_separators(&o, ", |"))) { + if(!*tok) continue; + ret |= alert_action_options_parse_one(tok); + } + + return ret; +} + +void alert_action_options_to_buffer_json_array(BUFFER *wb, const char *key, ALERT_ACTION_OPTIONS options) { + buffer_json_member_add_array(wb, key); + + RRDR_OPTIONS used = 0; // to prevent adding duplicates + for(int i = 0; alert_action_options[i].name ; i++) { + if (unlikely((alert_action_options[i].value & options) && !(alert_action_options[i].value & used))) { + const char *name = alert_action_options[i].name; + used |= alert_action_options[i].value; + + buffer_json_add_array_item_string(wb, name); + } + } + + buffer_json_array_close(wb); +} + +void alert_action_options_to_buffer(BUFFER *wb, ALERT_ACTION_OPTIONS options) { + RRDR_OPTIONS used = 0; // to prevent adding duplicates + for(int 
i = 0; alert_action_options[i].name ; i++) { + if (unlikely((alert_action_options[i].value & options) && !(alert_action_options[i].value & used))) { + if(used != 0) + buffer_strcat(wb, " "); + + const char *name = alert_action_options[i].name; + used |= alert_action_options[i].value; + + buffer_strcat(wb, name); + } + } +} + +static void alert_action_options_init(void) { + for(int i = 0; alert_action_options[i].name ; i++) + alert_action_options[i].hash = simple_hash(alert_action_options[i].name); +} + + +// --------------------------------------------------------------------------------------------------------------------- + +static void health_prototype_cleanup_one_unsafe(RRD_ALERT_PROTOTYPE *ap) { + rrd_alert_match_cleanup(&ap->match); + rrd_alert_config_cleanup(&ap->config); +} + +void health_prototype_cleanup(RRD_ALERT_PROTOTYPE *ap) { + spinlock_lock(&ap->_internal.spinlock); + + while(ap->_internal.next) { + RRD_ALERT_PROTOTYPE *t = ap->_internal.next; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ap->_internal.next, t, _internal.prev, _internal.next); + health_prototype_cleanup_one_unsafe(t); + freez(t); + } + + spinlock_unlock(&ap->_internal.spinlock); + + health_prototype_cleanup_one_unsafe(ap); +} + +void health_prototype_free(RRD_ALERT_PROTOTYPE *ap) { + if(!ap) return; + health_prototype_cleanup(ap); + freez(ap); +} + +void health_prototype_insert_cb(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { + RRD_ALERT_PROTOTYPE *ap = value; + spinlock_init(&ap->_internal.spinlock); + if(ap->config.source_type != DYNCFG_SOURCE_TYPE_DYNCFG) + ap->_internal.is_on_disk = true; +} + +bool health_prototype_conflict_cb(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data __maybe_unused) { + RRD_ALERT_PROTOTYPE *ap = old_value; + RRD_ALERT_PROTOTYPE *nap = new_value; + + bool replace = nap->config.source_type == DYNCFG_SOURCE_TYPE_DYNCFG; + + if(ap->config.source_type != DYNCFG_SOURCE_TYPE_DYNCFG || 
nap->config.source_type != DYNCFG_SOURCE_TYPE_DYNCFG) + ap->_internal.is_on_disk = nap->_internal.is_on_disk = true; + + if(!replace) { + if(ap->config.source_type == DYNCFG_SOURCE_TYPE_DYNCFG) { + // the existing is a dyncfg and the new one is read from the config + health_prototype_cleanup(nap); + memset(nap, 0, sizeof(*nap)); + } + else { + // alerts with the same name are appended to the existing one + nap = callocz(1, sizeof(*nap)); + memcpy(nap, new_value, sizeof(*nap)); + + spinlock_lock(&ap->_internal.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ap->_internal.next, nap, _internal.prev, _internal.next); + spinlock_unlock(&ap->_internal.spinlock); + + if(nap->_internal.enabled) + ap->_internal.enabled = true; + } + } + else { + // alerts with the same name replace the existing one + spinlock_init(&nap->_internal.spinlock); + nap->_internal.uses = ap->_internal.uses; + + spinlock_lock(&nap->_internal.spinlock); + spinlock_lock(&ap->_internal.spinlock); + SWAP(*ap, *nap); + spinlock_unlock(&ap->_internal.spinlock); + spinlock_unlock(&nap->_internal.spinlock); + + health_prototype_cleanup(nap); + memset(nap, 0, sizeof(*nap)); + } + + return true; +} + +void health_prototype_delete_cb(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { + RRD_ALERT_PROTOTYPE *ap = value; + health_prototype_cleanup(ap); +} + +void health_init_prototypes(void) { + if(health_globals.prototypes.dict) + return; + + health_globals.prototypes.dict = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + dictionary_register_insert_callback(health_globals.prototypes.dict, health_prototype_insert_cb, NULL); + dictionary_register_conflict_callback(health_globals.prototypes.dict, health_prototype_conflict_cb, NULL); + dictionary_register_delete_callback(health_globals.prototypes.dict, health_prototype_delete_cb, NULL); + + alert_action_options_init(); +} + +// 
--------------------------------------------------------------------------------------------------------------------- + +static inline struct pattern_array *health_config_add_key_to_values(struct pattern_array *pa, const char *input_key, char *value) +{ + char key[HEALTH_CONF_MAX_LINE + 1]; + char data[HEALTH_CONF_MAX_LINE + 1]; + + char *s = value; + size_t i = 0; + + char pair[HEALTH_CONF_MAX_LINE + 1]; + if (input_key) + strncpyz(key, input_key, HEALTH_CONF_MAX_LINE); + else + key[0] = '\0'; + + while(*s) { + if (*s == '=') { + //hold the key + data[i]='\0'; + strncpyz(key, data, HEALTH_CONF_MAX_LINE); + i=0; + } else if (*s == ' ') { + data[i]='\0'; + if (data[0]=='!') + snprintfz(pair, HEALTH_CONF_MAX_LINE, "!%s=%s ", key, data + 1); + else + snprintfz(pair, HEALTH_CONF_MAX_LINE, "%s=%s ", key, data); + + pa = pattern_array_add_key_simple_pattern(pa, key, simple_pattern_create(pair, NULL, SIMPLE_PATTERN_EXACT, true)); + i=0; + } else if (i < HEALTH_CONF_MAX_LINE) { + // data[] holds HEALTH_CONF_MAX_LINE characters plus the terminator; + // the excess of an over-long token is dropped instead of being written past the array + data[i++] = *s; + } + s++; + } + data[i]='\0'; + if (data[0]) { + if (data[0]=='!') + snprintfz(pair, HEALTH_CONF_MAX_LINE, "!%s=%s ", key, data + 1); + else + snprintfz(pair, HEALTH_CONF_MAX_LINE, "%s=%s ", key, data); + + pa = pattern_array_add_key_simple_pattern(pa, key, simple_pattern_create(pair, NULL, SIMPLE_PATTERN_EXACT, true)); + } + + return pa; +} + +// Returns a newly allocated copy of src in which a single space directly before +// and a single space directly after an '=' are dropped. The caller releases it with freez(). +static char *simple_pattern_trim_around_equal(const char *src) { + char *store = mallocz(strlen(src) + 1); + + char *dst = store; + while (*src) { + if (*src == '=') { + // look behind only when something has already been copied, + // so that a leading '=' does not read before the start of the buffer + if (dst > store && *(dst -1) == ' ') + dst--; + + *dst++ = *src++; + if (*src == ' ') + src++; + + // the '=' (or the space after it) may have been the last character; + // stop here instead of copying the terminator and walking past the end of src + if (!*src) + break; + } + + *dst++ = *src++; + } + *dst = 0x00; + + return store; +} + +struct pattern_array *trim_and_add_key_to_values(struct pattern_array *pa, const char *key, STRING *input) { + char *tmp = simple_pattern_trim_around_equal(string2str(input)); + pa = health_config_add_key_to_values(pa, key, tmp); + freez(tmp); + return pa; +} + +static void health_prototype_activate_match_patterns(struct rrd_alert_match *am) { + 
if(am->host_labels) { + pattern_array_free(am->host_labels_pattern); + am->host_labels_pattern = NULL; + am->host_labels_pattern = trim_and_add_key_to_values(am->host_labels_pattern, NULL, am->host_labels); + } + + if(am->chart_labels) { + pattern_array_free(am->chart_labels_pattern); + am->chart_labels_pattern = NULL; + am->chart_labels_pattern = trim_and_add_key_to_values(am->chart_labels_pattern, NULL, am->chart_labels); + } +} + +void health_prototype_hash_id(RRD_ALERT_PROTOTYPE *ap) { + CLEAN_BUFFER *wb = buffer_create(100, NULL); + health_prototype_to_json(wb, ap, true); + ND_UUID uuid = UUID_generate_from_hash(buffer_tostring(wb), buffer_strlen(wb)); + uuid_copy(ap->config.hash_id, uuid.uuid); + + sql_alert_store_config(ap); +} + +bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap, char **msg) { + if(!ap->match.is_template) { + if(!ap->match.on.chart) { + netdata_log_error( + "HEALTH: alert '%s' does not define a instance (parameter 'on'). Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + if(msg) + *msg = "missing match 'on' parameter for instance"; + return false; + } + } + else { + if(!ap->match.on.context) { + netdata_log_error( + "HEALTH: alert '%s' does not define a context (parameter 'on'). Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + if(msg) + *msg = "missing match 'on' parameter for context"; + return false; + } + } + + if(!ap->config.update_every) { + netdata_log_error( + "HEALTH: alert '%s' has no frequency (parameter 'every'). Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + if(msg) + *msg = "missing update frequency"; + return false; + } + + if(!RRDCALC_HAS_DB_LOOKUP(ap) && !ap->config.calculation && !ap->config.warning && !ap->config.critical) { + netdata_log_error( + "HEALTH: alert '%s' is useless (no db lookup, no calculation, no warning and no critical expressions). 
Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + if(msg) + *msg = "no db lookup, calculation and warning/critical conditions"; + return false; + } + + // activate the match patterns in it + bool enabled = false; + for(RRD_ALERT_PROTOTYPE *t = ap; t ;t = t->_internal.next) { + // we need to generate config_hash_id for each instance included + // so, let's break the linked list for this iteration + + RRD_ALERT_PROTOTYPE *prev = t->_internal.prev; + RRD_ALERT_PROTOTYPE *next = t->_internal.next; + t->_internal.prev = t; + t->_internal.next = NULL; + + if(t->match.enabled) + enabled = true; + + if(!t->config.name) + t->config.name = string_dup(ap->config.name); + + health_prototype_hash_id(t); + + health_prototype_activate_match_patterns(&t->match); + + if (!t->config.exec) + t->config.exec = string_dup(health_globals.config.default_exec); + + if (!t->config.recipient) + t->config.recipient = string_dup(health_globals.config.default_recipient); + + // restore the linked list + t->_internal.prev = prev; + t->_internal.next = next; + } + ap->_internal.enabled = enabled; + + // add it to the prototypes + dictionary_set_advanced(health_globals.prototypes.dict, + string2str(ap->config.name), string_strlen(ap->config.name), + ap, sizeof(*ap), + NULL); + + return true; +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_reload_prototypes(void) { + // remove all dyncfg related to prototypes + health_dyncfg_unregister_all_prototypes(); + + // clear old prototypes from memory + dictionary_flush(health_globals.prototypes.dict); + + // load the prototypes from disk + recursive_config_double_dir_load( + health_user_config_dir(), + health_globals.config.stock_enabled ? 
health_stock_config_dir() : NULL, + NULL, + health_readfile, + NULL, 0); + + // register all loaded prototypes + health_dyncfg_register_all_prototypes(); +} + +// --------------------------------------------------------------------------------------------------------------------- + +static bool prototype_matches_host(RRDHOST *host, RRD_ALERT_PROTOTYPE *ap) { + if(health_globals.config.enabled_alerts && + !simple_pattern_matches(health_globals.config.enabled_alerts, string2str(ap->config.name))) + return false; + + if (host->rrdlabels && ap->match.host_labels_pattern && + !pattern_array_label_match(ap->match.host_labels_pattern, host->rrdlabels, '=', NULL)) + return false; + + return true; +} + +static bool prototype_matches_rrdset(RRDSET *st, RRD_ALERT_PROTOTYPE *ap) { + // match the chart id + if(!ap->match.is_template && ap->match.on.chart && + ap->match.on.chart != st->id && ap->match.on.chart != st->name) + return false; + + // match the chart context + if(ap->match.is_template && ap->match.on.context && + ap->match.on.context != st->context) + return false; + + if (st->rrdlabels && ap->match.chart_labels_pattern && + !pattern_array_label_match(ap->match.chart_labels_pattern, st->rrdlabels, '=', NULL)) + return false; + + return true; +} + +void health_prototype_copy_match_without_patterns(struct rrd_alert_match *dst, struct rrd_alert_match *src) { + dst->enabled = src->enabled; + dst->is_template = src->is_template; + + if(dst->is_template) + dst->on.context = string_dup(src->on.context); + else + dst->on.chart = string_dup(src->on.chart); + + dst->host_labels = string_dup(src->host_labels); + dst->chart_labels = string_dup(src->chart_labels); +} + +void health_prototype_copy_config(struct rrd_alert_config *dst, struct rrd_alert_config *src) { + uuid_copy(dst->hash_id, src->hash_id); + + dst->name = string_dup(src->name); + + dst->exec = string_dup(src->exec); + dst->recipient = string_dup(src->recipient); + + dst->classification = 
string_dup(src->classification); + dst->component = string_dup(src->component); + dst->type = string_dup(src->type); + + dst->source_type = src->source_type; + dst->source = string_dup(src->source); + dst->units = string_dup(src->units); + dst->summary = string_dup(src->summary); + dst->info = string_dup(src->info); + + dst->update_every = src->update_every; + + dst->alert_action_options = src->alert_action_options; + + dst->dimensions = string_dup(src->dimensions); + + dst->time_group = src->time_group; + dst->time_group_condition = src->time_group_condition; + dst->time_group_value = src->time_group_value; + dst->dims_group = src->dims_group; + dst->data_source = src->data_source; + dst->before = src->before; + dst->after = src->after; + dst->options = src->options; + + const char *failed_at = NULL; + int error = 0; + + dst->calculation = expression_parse(expression_source(src->calculation), &failed_at, &error); + dst->warning = expression_parse(expression_source(src->warning), &failed_at, &error); + dst->critical = expression_parse(expression_source(src->critical), &failed_at, &error); + + dst->delay_up_duration = src->delay_up_duration; + dst->delay_down_duration = src->delay_down_duration; + dst->delay_max_duration = src->delay_max_duration; + dst->delay_multiplier = src->delay_multiplier; + + dst->has_custom_repeat_config = src->has_custom_repeat_config; + dst->warn_repeat_every = src->warn_repeat_every; + dst->crit_repeat_every = src->crit_repeat_every; +} + +static void health_prototype_apply_to_rrdset(RRDSET *st, RRD_ALERT_PROTOTYPE *ap) { + if(!ap->_internal.enabled) + return; + + spinlock_lock(&ap->_internal.spinlock); + for(size_t template = 0; template < 2; template++) { + bool want_template = template ? true : false; + + for (RRD_ALERT_PROTOTYPE *t = ap; t; t = t->_internal.next) { + if (!t->match.enabled) + continue; + + bool is_template = t->match.is_template ? 
true : false; + + if (is_template != want_template) + continue; + + if (!prototype_matches_host(st->rrdhost, t)) + continue; + + if (!prototype_matches_rrdset(st, t)) + continue; + + if (rrdcalc_add_from_prototype(st->rrdhost, st, t)) + ap->_internal.uses++; + } + } + spinlock_unlock(&ap->_internal.spinlock); +} + +void health_prototype_alerts_for_rrdset_incrementally(RRDSET *st) { + RRD_ALERT_PROTOTYPE *ap; + dfe_start_read(health_globals.prototypes.dict, ap) { + health_prototype_apply_to_rrdset(st, ap); + } + dfe_done(ap); +} + +void health_prototype_reset_alerts_for_rrdset(RRDSET *st) { + rrdcalc_unlink_and_delete_all_rrdset_alerts(st); + health_prototype_alerts_for_rrdset_incrementally(st); +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_apply_prototype_to_host(RRDHOST *host, RRD_ALERT_PROTOTYPE *ap) { + if(!ap->_internal.enabled) + return; + + if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) + return; + + RRDSET *st; + rrdset_foreach_read(st, host) { + health_prototype_apply_to_rrdset(st, ap); + } + rrdset_foreach_done(st); +} + +void health_prototype_apply_to_all_hosts(RRD_ALERT_PROTOTYPE *ap) { + if(!ap->_internal.enabled) + return; + + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host){ + health_apply_prototype_to_host(host, ap); + } + dfe_done(host); +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_apply_prototypes_to_host(RRDHOST *host) { + if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) + return; + + // free all running alarms + rrdcalc_delete_all(host); + + // invalidate all previous entries in the alarm log + rw_spinlock_read_lock(&host->health_log.spinlock); + ALARM_ENTRY *t; + for(t = host->health_log.alarms ; t ; t = t->next) { + 
if(t->new_status != RRDCALC_STATUS_REMOVED) + t->flags |= HEALTH_ENTRY_FLAG_UPDATED; + } + rw_spinlock_read_unlock(&host->health_log.spinlock); + + // apply all the prototypes for the charts of the host + RRDSET *st; + rrdset_foreach_read(st, host) { + health_prototype_reset_alerts_for_rrdset(st); + } + rrdset_foreach_done(st); + +#ifdef ENABLE_ACLK + if (netdata_cloud_enabled) { + struct aclk_sync_cfg_t *wc = host->aclk_config; + if (likely(wc)) { + wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS; + } + } +#endif +} + +void health_apply_prototypes_to_all_hosts(void) { + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host){ + health_apply_prototypes_to_host(host); + } + dfe_done(host); +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_prototype_metadata_foreach(void *data, prototype_metadata_cb_t cb) { + RRD_ALERT_PROTOTYPE *ap; + dfe_start_read(health_globals.prototypes.dict, ap) { + cb(data, ap->config.type, ap->config.component, ap->config.classification, ap->config.recipient); + } + dfe_done(ap); +} diff --git a/src/health/health_prototypes.h b/src/health/health_prototypes.h new file mode 100644 index 000000000..e226c1929 --- /dev/null +++ b/src/health/health_prototypes.h @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_HEALTH_PROTOTYPES_H +#define NETDATA_HEALTH_PROTOTYPES_H + +#include "../web/api/queries/rrdr.h" + +typedef enum __attribute__((packed)) { + ALERT_ACTION_OPTION_NONE = 0, + ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION = (1 << 0), +} ALERT_ACTION_OPTIONS; + +typedef enum __attribute__((packed)) { + ALERT_LOOKUP_DATA_SOURCE_SAMPLES = 0, + ALERT_LOOKUP_DATA_SOURCE_PERCENTAGES, + ALERT_LOOKUP_DATA_SOURCE_ANOMALIES, +} ALERT_LOOKUP_DATA_SOURCE; +ALERT_LOOKUP_DATA_SOURCE alerts_data_sources2id(const char *source); +const char *alerts_data_source_id2source(ALERT_LOOKUP_DATA_SOURCE source); + +typedef enum 
__attribute__((packed)) { + ALERT_LOOKUP_DIMS_SUM = 0, + ALERT_LOOKUP_DIMS_MIN, + ALERT_LOOKUP_DIMS_MAX, + ALERT_LOOKUP_DIMS_AVERAGE, + ALERT_LOOKUP_DIMS_MIN2MAX, +} ALERT_LOOKUP_DIMS_GROUPING; +ALERT_LOOKUP_DIMS_GROUPING alerts_dims_grouping2id(const char *group); +const char *alerts_dims_grouping_id2group(ALERT_LOOKUP_DIMS_GROUPING grouping); + +typedef enum __attribute__((packed)) { + ALERT_LOOKUP_TIME_GROUP_CONDITION_EQUAL, + ALERT_LOOKUP_TIME_GROUP_CONDITION_NOT_EQUAL, + ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER, + ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS, + ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER_EQUAL, + ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS_EQUAL, +} ALERT_LOOKUP_TIME_GROUP_CONDITION; +ALERT_LOOKUP_TIME_GROUP_CONDITION alerts_group_condition2id(const char *source); +const char *alerts_group_conditions_id2txt(ALERT_LOOKUP_TIME_GROUP_CONDITION source); + +struct rrd_alert_match { + bool enabled; + + bool is_template; + union { + STRING *chart; + STRING *context; + } on; + + STRING *host_labels; // the label read from an alarm file + STRING *chart_labels; // the chart label read from an alarm file + + struct pattern_array *host_labels_pattern; + struct pattern_array *chart_labels_pattern; +}; +void rrd_alert_match_cleanup(struct rrd_alert_match *am); + +struct rrd_alert_config { + nd_uuid_t hash_id; + + STRING *name; // the name of this alarm + + STRING *exec; // the command to execute when this alarm switches state + STRING *recipient; // the recipient of the alarm (the first parameter to exec) + + STRING *classification; // the class that this alarm belongs + STRING *component; // the component that this alarm refers to + STRING *type; // type of the alarm + + DYNCFG_SOURCE_TYPE source_type; + STRING *source; // the source of this alarm + STRING *units; // the units of the alarm + STRING *summary; // a short alert summary + STRING *info; // a description of the alarm + + int update_every; // update frequency for the alarm + + ALERT_ACTION_OPTIONS 
alert_action_options; + + // ------------------------------------------------------------------------ + // database lookup settings + + STRING *dimensions; // the chart dimensions + RRDR_TIME_GROUPING time_group; // grouping method: average, max, etc. + ALERT_LOOKUP_TIME_GROUP_CONDITION time_group_condition; + NETDATA_DOUBLE time_group_value; + ALERT_LOOKUP_DIMS_GROUPING dims_group; // grouping method for dimensions + ALERT_LOOKUP_DATA_SOURCE data_source; + int before; // ending point in time-series + int after; // starting point in time-series + RRDR_OPTIONS options; // configuration options + + // ------------------------------------------------------------------------ + // expressions related to the alarm + + EVAL_EXPRESSION *calculation; // expression to calculate the value of the alarm + EVAL_EXPRESSION *warning; // expression to check the warning condition + EVAL_EXPRESSION *critical; // expression to check the critical condition + + // ------------------------------------------------------------------------ + // notification delay settings + + int delay_up_duration; // duration to delay notifications when alarm raises + int delay_down_duration; // duration to delay notifications when alarm lowers + int delay_max_duration; // the absolute max delay to apply to this alarm + float delay_multiplier; // multiplier for all delays when alarms switch status + // while now < delay_up_to + + // ------------------------------------------------------------------------ + // notification repeat settings + + bool has_custom_repeat_config; + uint32_t warn_repeat_every; // interval between repeating warning notifications + uint32_t crit_repeat_every; // interval between repeating critical notifications +}; +void rrd_alert_config_cleanup(struct rrd_alert_config *ac); + +#include "health.h" + +void health_init_prototypes(void); + +bool health_plugin_enabled(void); +void health_plugin_disable(void); + +void health_reload_prototypes(void); +void 
health_apply_prototypes_to_host(RRDHOST *host); +void health_apply_prototypes_to_all_hosts(void); + +void health_prototype_alerts_for_rrdset_incrementally(RRDSET *st); + +struct rrd_alert_config; +struct rrd_alert_match; +void health_prototype_copy_config(struct rrd_alert_config *dst, struct rrd_alert_config *src); +void health_prototype_copy_match_without_patterns(struct rrd_alert_match *dst, struct rrd_alert_match *src); +void health_prototype_reset_alerts_for_rrdset(RRDSET *st); + +#endif //NETDATA_HEALTH_PROTOTYPES_H diff --git a/src/health/health_silencers.c b/src/health/health_silencers.c new file mode 100644 index 000000000..ee829780f --- /dev/null +++ b/src/health/health_silencers.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health_internals.h" + +#define HEALTH_CMDAPI_CMD_SILENCEALL "SILENCE ALL" +#define HEALTH_CMDAPI_CMD_DISABLEALL "DISABLE ALL" +#define HEALTH_CMDAPI_CMD_SILENCE "SILENCE" +#define HEALTH_CMDAPI_CMD_DISABLE "DISABLE" +#define HEALTH_CMDAPI_CMD_RESET "RESET" +#define HEALTH_CMDAPI_CMD_LIST "LIST" + +#define HEALTH_CMDAPI_MSG_AUTHERROR "Auth Error\n" +#define HEALTH_CMDAPI_MSG_SILENCEALL "All alarm notifications are silenced\n" +#define HEALTH_CMDAPI_MSG_DISABLEALL "All health checks are disabled\n" +#define HEALTH_CMDAPI_MSG_RESET "All health checks and notifications are enabled\n" +#define HEALTH_CMDAPI_MSG_DISABLE "Health checks disabled for alarms matching the selectors\n" +#define HEALTH_CMDAPI_MSG_SILENCE "Alarm notifications silenced for alarms matching the selectors\n" +#define HEALTH_CMDAPI_MSG_ADDED "Alarm selector added\n" +#define HEALTH_CMDAPI_MSG_STYPEWARNING "WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command.\n" +#define HEALTH_CMDAPI_MSG_NOSELECTORWARNING "WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors.\n" + +SILENCERS *silencers; + +/** + * Create Silencer + * + * Allocate a new silencer to Netdata. 
+ * + * @return It returns the address off the silencer on success and NULL otherwise + */ +SILENCER *create_silencer(void) { + SILENCER *t = callocz(1, sizeof(SILENCER)); + netdata_log_debug(D_HEALTH, "HEALTH command API: Created empty silencer"); + + return t; +} + +/** + * Health Silencers add + * + * Add more one silencer to the list of silencers. + * + * @param silencer + */ +void health_silencers_add(SILENCER *silencer) { + // Add the created instance to the linked list in silencers + silencer->next = silencers->silencers; + silencers->silencers = silencer; + netdata_log_debug( + D_HEALTH, + "HEALTH command API: Added silencer %s:%s:%s:%s", + silencer->alarms, + silencer->charts, + silencer->contexts, + silencer->hosts); +} + +/** + * Silencers Add Parameter + * + * Create a new silencer and adjust the variables + * + * @param silencer a pointer to the silencer that will be adjusted + * @param key the key value sent by client + * @param value the value sent to the key + * + * @return It returns the silencer configured on success and NULL otherwise + */ +SILENCER *health_silencers_addparam(SILENCER *silencer, char *key, char *value) { + static uint32_t + hash_alarm = 0, + hash_template = 0, + hash_chart = 0, + hash_context = 0, + hash_host = 0; + + if (unlikely(!hash_alarm)) { + hash_alarm = simple_uhash(HEALTH_ALARM_KEY); + hash_template = simple_uhash(HEALTH_TEMPLATE_KEY); + hash_chart = simple_uhash(HEALTH_CHART_KEY); + hash_context = simple_uhash(HEALTH_CONTEXT_KEY); + hash_host = simple_uhash(HEALTH_HOST_KEY); + } + + uint32_t hash = simple_uhash(key); + if (unlikely(silencer == NULL)) { + if ( + (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) || + (hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) || + (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) || + (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) || + (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) + ) { + silencer = create_silencer(); + } 
+ } + + // the same key can arrive more than once for one silencer; + // release the value and the pattern stored earlier, so that they are not leaked when overwritten + if (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { + simple_pattern_free(silencer->alarms_pattern); + freez(silencer->alarms); + silencer->alarms = strdupz(value); + silencer->alarms_pattern = simple_pattern_create(silencer->alarms, NULL, SIMPLE_PATTERN_EXACT, true); + } else if (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) { + simple_pattern_free(silencer->charts_pattern); + freez(silencer->charts); + silencer->charts = strdupz(value); + silencer->charts_pattern = simple_pattern_create(silencer->charts, NULL, SIMPLE_PATTERN_EXACT, true); + } else if (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) { + simple_pattern_free(silencer->contexts_pattern); + freez(silencer->contexts); + silencer->contexts = strdupz(value); + silencer->contexts_pattern = simple_pattern_create(silencer->contexts, NULL, SIMPLE_PATTERN_EXACT, true); + } else if (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { + simple_pattern_free(silencer->hosts_pattern); + freez(silencer->hosts); + silencer->hosts = strdupz(value); + silencer->hosts_pattern = simple_pattern_create(silencer->hosts, NULL, SIMPLE_PATTERN_EXACT, true); + } + + return silencer; +} + +/** + * JSON Read Callback + * + * Callback called by netdata to create the silencer. + * + * @param e the main json structure + * + * @return It always returns 0. 
+ */ +int health_silencers_json_read_callback(JSON_ENTRY *e) +{ + switch(e->type) { + case JSON_OBJECT: +#ifndef ENABLE_JSONC + e->callback_function = health_silencers_json_read_callback; + if(strcmp(e->name,"")) { + // init silencer + netdata_log_debug(D_HEALTH, "JSON: Got object with a name, initializing new silencer for %s",e->name); +#endif + e->callback_data = create_silencer(); + if(e->callback_data) { + health_silencers_add(e->callback_data); + } +#ifndef ENABLE_JSONC + } +#endif + break; + + case JSON_ARRAY: + e->callback_function = health_silencers_json_read_callback; + break; + + case JSON_STRING: + if(!strcmp(e->name,"type")) { + netdata_log_debug(D_HEALTH, "JSON: Processing type=%s",e->data.string); + if (!strcmp(e->data.string,"SILENCE")) silencers->stype = STYPE_SILENCE_NOTIFICATIONS; + else if (!strcmp(e->data.string,"DISABLE")) silencers->stype = STYPE_DISABLE_ALARMS; + } else { + netdata_log_debug(D_HEALTH, "JSON: Adding %s=%s", e->name, e->data.string); + if (e->callback_data) + (void)health_silencers_addparam(e->callback_data, e->name, e->data.string); + } + break; + + case JSON_BOOLEAN: + netdata_log_debug(D_HEALTH, "JSON: Processing all_alarms"); + silencers->all_alarms=e->data.boolean?1:0; + break; + + case JSON_NUMBER: + case JSON_NULL: + break; + } + + return 0; +} + +/** + * Initialize Global Silencers + * + * Initialize the silencer for the whole netdata system. + * + * @return It returns 0 on success and -1 otherwise + */ +int health_initialize_global_silencers() { + silencers = mallocz(sizeof(SILENCERS)); + silencers->all_alarms = 0; + silencers->stype = STYPE_NONE; + silencers->silencers = NULL; + + return 0; +} + +// ---------------------------------------------------------------------------- + +/** + * Free Silencers + * + * Clean the silencer structure + * + * @param t is the structure that will be cleaned. 
+ */ +void free_silencers(SILENCER *t) { + if (!t) return; + + while(t) { + SILENCER *next = t->next; + + simple_pattern_free(t->alarms_pattern); + simple_pattern_free(t->charts_pattern); + simple_pattern_free(t->contexts_pattern); + simple_pattern_free(t->hosts_pattern); + freez(t->alarms); + freez(t->charts); + freez(t->contexts); + freez(t->hosts); + freez(t); + + t = next; + } +} + +/** + * Silencers to JSON Entry + * + * Fill the buffer with the other values given. + * + * @param wb a pointer to the output buffer + * @param var the json variable + * @param val the json value + * @param hasprev has it a previous value? + * + * @return + */ +int health_silencers2json_entry(BUFFER *wb, char* var, char* val, int hasprev) { + if (val) { + buffer_sprintf(wb, "%s\n\t\t\t\"%s\": \"%s\"", (hasprev)?",":"", var, val); + return 1; + } else { + return hasprev; + } +} + +/** + * Silencer to JSON + * + * Write the silencer values using JSON format inside a buffer. + * + * @param wb is the buffer to write the silencers. 
+ */ +void health_silencers2json(BUFFER *wb) { + buffer_sprintf(wb, "{\n\t\"all\": %s," + "\n\t\"type\": \"%s\"," + "\n\t\"silencers\": [", + (silencers->all_alarms)?"true":"false", + (silencers->stype == STYPE_NONE)?"None":((silencers->stype == STYPE_DISABLE_ALARMS)?"DISABLE":"SILENCE")); + + SILENCER *silencer; + int i = 0, j = 0; + for(silencer = silencers->silencers; silencer ; silencer = silencer->next) { + if(likely(i)) buffer_strcat(wb, ","); + buffer_strcat(wb, "\n\t\t{"); + j=health_silencers2json_entry(wb, HEALTH_ALARM_KEY, silencer->alarms, j); + j=health_silencers2json_entry(wb, HEALTH_CHART_KEY, silencer->charts, j); + j=health_silencers2json_entry(wb, HEALTH_CONTEXT_KEY, silencer->contexts, j); + j=health_silencers2json_entry(wb, HEALTH_HOST_KEY, silencer->hosts, j); + j=0; + buffer_strcat(wb, "\n\t\t}"); + i++; + } + if(likely(i)) buffer_strcat(wb, "\n\t"); + buffer_strcat(wb, "]\n}\n"); +} + + +/** + * Silencer to FILE + * + * Write the silencer buffer to a file. + * @param wb + */ +void health_silencers2file(BUFFER *wb) { + if (wb->len == 0) return; + + FILE *fd = fopen(health_silencers_filename(), "wb"); + if(fd) { + size_t written = (size_t)fprintf(fd, "%s", wb->buffer) ; + if (written == wb->len ) { + netdata_log_info("Silencer changes written to %s", health_silencers_filename()); + } + fclose(fd); + return; + } + netdata_log_error("Silencer changes could not be written to %s. Error %s", health_silencers_filename(), strerror(errno)); +} + +/** + * Request V1 MGMT Health + * + * Function called by api to management the health. + * + * @param host main structure with client information! + * @param w is the structure with all information of the client request. + * @param url is the url that netdata is working + * + * @return It returns 200 on success and another code otherwise. 
+ */ +int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, char *url) { + int ret; + (void) host; + + BUFFER *wb = w->response.data; + buffer_flush(wb); + wb->content_type = CT_TEXT_PLAIN; + + buffer_flush(w->response.data); + + //Local instance of the silencer + SILENCER *silencer = NULL; + int config_changed = 1; + + if (!w->auth_bearer_token) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_AUTHERROR); + ret = HTTP_RESP_FORBIDDEN; + } else { + netdata_log_debug(D_HEALTH, "HEALTH command API: Comparing secret '%s' to '%s'", w->auth_bearer_token, api_secret); + if (strcmp(w->auth_bearer_token, api_secret) != 0) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_AUTHERROR); + ret = HTTP_RESP_FORBIDDEN; + } else { + while (url) { + char *value = strsep_skip_consecutive_separators(&url, "&"); + if (!value || !*value) continue; + + char *key = strsep_skip_consecutive_separators(&value, "="); + if (!key || !*key) continue; + if (!value || !*value) continue; + + netdata_log_debug(D_WEB_CLIENT, "%llu: API v1 health query param '%s' with value '%s'", w->id, key, value); + + // name and value are now the parameters + if (!strcmp(key, "cmd")) { + if (!strcmp(value, HEALTH_CMDAPI_CMD_SILENCEALL)) { + silencers->all_alarms = 1; + silencers->stype = STYPE_SILENCE_NOTIFICATIONS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_SILENCEALL); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_DISABLEALL)) { + silencers->all_alarms = 1; + silencers->stype = STYPE_DISABLE_ALARMS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_DISABLEALL); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_SILENCE)) { + silencers->stype = STYPE_SILENCE_NOTIFICATIONS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_SILENCE); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_DISABLE)) { + silencers->stype = STYPE_DISABLE_ALARMS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_DISABLE); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_RESET)) { + silencers->all_alarms = 0; + silencers->stype = STYPE_NONE; + free_silencers(silencers->silencers); + 
silencers->silencers = NULL; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_RESET); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_LIST)) { + w->response.data->content_type = CT_APPLICATION_JSON; + health_silencers2json(wb); + config_changed=0; + } + } else { + silencer = health_silencers_addparam(silencer, key, value); + } + } + + if (likely(silencer)) { + health_silencers_add(silencer); + buffer_strcat(wb, HEALTH_CMDAPI_MSG_ADDED); + if (silencers->stype == STYPE_NONE) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_STYPEWARNING); + } + } + if (unlikely(silencers->stype != STYPE_NONE && !silencers->all_alarms && !silencers->silencers)) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_NOSELECTORWARNING); + } + ret = HTTP_RESP_OK; + } + } + w->response.data = wb; + buffer_no_cacheable(w->response.data); + if (ret == HTTP_RESP_OK && config_changed) { + BUFFER *jsonb = buffer_create(200, &netdata_buffers_statistics.buffers_health); + health_silencers2json(jsonb); + health_silencers2file(jsonb); + buffer_free(jsonb); + } + + return ret; +} + +// ---------------------------------------------------------------------------- + +const char *health_silencers_filename(void) { + return string2str(health_globals.config.silencers_filename); +} + +void health_set_silencers_filename(void) { + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/health.silencers.json", netdata_configured_varlib_dir); + + health_globals.config.silencers_filename = + string_strdupz(config_get(CONFIG_SECTION_HEALTH, "silencers file", filename)); +} + +void health_silencers_init(void) { + FILE *fd = fopen(health_silencers_filename(), "r"); + if (fd) { + fseek(fd, 0 , SEEK_END); + off_t length = (off_t) ftell(fd); + fseek(fd, 0 , SEEK_SET); + + if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) { + char *str = mallocz((length+1)* sizeof(char)); + if(str) { + size_t copied; + copied = fread(str, sizeof(char), length, fd); + if (copied == (length* sizeof(char))) { + str[length] = 0x00; + json_parse(str, 
NULL, health_silencers_json_read_callback); + netdata_log_info("Parsed health silencers file %s", health_silencers_filename()); + } else { + netdata_log_error("Cannot read the data from health silencers file %s", health_silencers_filename()); + } + freez(str); + } + } else { + netdata_log_error("Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.", + health_silencers_filename(), + (int64_t)length, + HEALTH_SILENCERS_MAX_FILE_LEN); + } + fclose(fd); + } else { + netdata_log_info("Cannot open the file %s, so Netdata will work with the default health configuration.", + health_silencers_filename()); + } +} + +SILENCE_TYPE health_silencers_check_silenced(RRDCALC *rc, const char *host) { + SILENCER *s; + + for (s = silencers->silencers; s!=NULL; s=s->next){ + if ( + (!s->alarms_pattern || (rc->config.name && s->alarms_pattern && simple_pattern_matches_string(s->alarms_pattern, rc->config.name))) && + (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches_string(s->contexts_pattern, rc->rrdset->context))) && + (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern, host))) && + (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) + ) { + netdata_log_debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts); + if (unlikely(silencers->stype == STYPE_NONE)) { + netdata_log_debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. 
The match has no effect.", rrdcalc_name(rc)); + } else { + netdata_log_debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s" + , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" + , rrdcalc_name(rc) + , (rc->rrdset)?rrdset_context(rc->rrdset):"" + , rrdcalc_chart_name(rc) + , host + ); + } + return silencers->stype; + } + } + return STYPE_NONE; +} + +int health_silencers_update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { + uint32_t rrdcalc_flags_old = rc->run_flags; + // Clear the flags + rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); + if (unlikely(silencers->all_alarms)) { + if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED; + else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED; + } else { + SILENCE_TYPE st = health_silencers_check_silenced(rc, rrdhost_hostname(host)); + if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED; + else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED; + } + + if (rrdcalc_flags_old != rc->run_flags) { + netdata_log_info( + "Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", + rrdhost_hostname(host), + rrdcalc_name(rc), + (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED) ? "true" : "false", + (rc->run_flags & RRDCALC_FLAG_DISABLED) ? "true" : "false", + (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED) ? "true" : "false", + (rc->run_flags & RRDCALC_FLAG_SILENCED) ? 
"true" : "false"); + } + if (rc->run_flags & RRDCALC_FLAG_DISABLED) + return 1; + else + return 0; +} diff --git a/src/health/health_silencers.h b/src/health/health_silencers.h new file mode 100644 index 000000000..fe80c2477 --- /dev/null +++ b/src/health/health_silencers.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_HEALTH_SILENCERS_H +#define NETDATA_HEALTH_SILENCERS_H + +#include "health.h" + +typedef struct silencer { + char *alarms; + SIMPLE_PATTERN *alarms_pattern; + + char *hosts; + SIMPLE_PATTERN *hosts_pattern; + + char *contexts; + SIMPLE_PATTERN *contexts_pattern; + + char *charts; + SIMPLE_PATTERN *charts_pattern; + + struct silencer *next; +} SILENCER; + +typedef enum silence_type { + STYPE_NONE, + STYPE_DISABLE_ALARMS, + STYPE_SILENCE_NOTIFICATIONS +} SILENCE_TYPE; + +typedef struct silencers { + int all_alarms; + SILENCE_TYPE stype; + SILENCER *silencers; +} SILENCERS; + +extern SILENCERS *silencers; + +SILENCER *create_silencer(void); +int health_silencers_json_read_callback(JSON_ENTRY *e); +void health_silencers_add(SILENCER *silencer); +SILENCER * health_silencers_addparam(SILENCER *silencer, char *key, char *value); +int health_initialize_global_silencers(); + +void free_silencers(SILENCER *t); + +struct web_client; +int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, char *url); + +const char *health_silencers_filename(void); +void health_set_silencers_filename(void); +void health_silencers_init(void); +SILENCE_TYPE health_silencers_check_silenced(RRDCALC *rc, const char *host); +int health_silencers_update_disabled_silenced(RRDHOST *host, RRDCALC *rc); + +#endif //NETDATA_HEALTH_SILENCERS_H diff --git a/src/health/health_variable.c b/src/health/health_variable.c new file mode 100644 index 000000000..69637de64 --- /dev/null +++ b/src/health/health_variable.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health.h" +#include "health_internals.h" + 
+struct variable_lookup_score { + RRDSET *st; + const char *source; + NETDATA_DOUBLE value; + size_t score; +}; + +struct variable_lookup_job { + RRDCALC *rc; + RRDHOST *host; + STRING *variable; + STRING *dim; + const char *dimension; + size_t dimension_length; + enum { + DIM_SELECT_NORMAL, + DIM_SELECT_RAW, + DIM_SELECT_LAST_COLLECTED, + } dimension_selection; + + struct { + size_t size; + size_t used; + struct variable_lookup_score *array; + } result; + + struct { + RRDSET *last_rrdset; + size_t last_score; + } score; +}; + +static void variable_lookup_add_result_with_score(struct variable_lookup_job *vbd, NETDATA_DOUBLE n, RRDSET *st, const char *source __maybe_unused) { + if(vbd->score.last_rrdset != st) { + vbd->score.last_rrdset = st; + vbd->score.last_score = rrdlabels_common_count(vbd->rc->rrdset->rrdlabels, st->rrdlabels); + } + + if(vbd->result.used >= vbd->result.size) { + if(!vbd->result.size) + vbd->result.size = 1; + + vbd->result.size *= 2; + vbd->result.array = reallocz(vbd->result.array, sizeof(struct variable_lookup_score) * vbd->result.size); + } + + vbd->result.array[vbd->result.used++] = (struct variable_lookup_score) { + .value = n, + .score = vbd->score.last_score, + .st = st, + .source = source, + }; +} + +static bool variable_lookup_in_chart(struct variable_lookup_job *vbd, RRDSET *st, bool stop_on_match) { + bool found = false; + const DICTIONARY_ITEM *item = NULL; + RRDDIM *rd = NULL; + dfe_start_read(st->rrddim_root_index, rd) { + if(rd->id == vbd->dim || rd->name == vbd->dim) { + item = dictionary_acquired_item_dup(st->rrddim_root_index, rd_dfe.item); + break; + } + } + dfe_done(rd); + + if (item) { + switch (vbd->dimension_selection) { + case DIM_SELECT_NORMAL: + variable_lookup_add_result_with_score(vbd, (NETDATA_DOUBLE)rd->collector.last_stored_value, st, "last stored value of dimension"); + break; + case DIM_SELECT_RAW: + variable_lookup_add_result_with_score(vbd, (NETDATA_DOUBLE)rd->collector.last_collected_value, st, "last 
collected value of dimension"); + break; + case DIM_SELECT_LAST_COLLECTED: + variable_lookup_add_result_with_score(vbd, (NETDATA_DOUBLE)rd->collector.last_collected_time.tv_sec, st, "last collected time of dimension"); + break; + } + + dictionary_acquired_item_release(st->rrddim_root_index, item); + found = true; + } + if(found && stop_on_match) goto cleanup; + + // chart variable + { + NETDATA_DOUBLE n; + if(rrdvar_get_custom_chart_variable_value(st, vbd->variable, &n)) { + variable_lookup_add_result_with_score(vbd, n, st, "chart variable"); + found = true; + } + } + if(found && stop_on_match) goto cleanup; + +cleanup: + return found; +} + +static int foreach_instance_in_context_cb(RRDSET *st, void *data) { + struct variable_lookup_job *vbd = data; + return variable_lookup_in_chart(vbd, st, false) ? 1 : 0; +} + +static bool variable_lookup_context(struct variable_lookup_job *vbd, const char *chart_or_context, const char *dim_id_or_name) { + struct variable_lookup_job vbd_back = *vbd; + + vbd->dimension = dim_id_or_name; + vbd->dim = string_strdupz(vbd->dimension); + vbd->dimension_length = string_strlen(vbd->dim); + // vbd->dimension_selection = DIM_SELECT_NORMAL; + + bool found = false; + + // lookup chart in host + + RRDSET_ACQUIRED *rsa = rrdset_find_and_acquire(vbd->host, chart_or_context); + if(rsa) { + if(variable_lookup_in_chart(vbd, rrdset_acquired_to_rrdset(rsa), false)) + found = true; + rrdset_acquired_release(rsa); + } + + // lookup context in contexts, then foreach chart + + if(rrdcontext_foreach_instance_with_rrdset_in_context(vbd->host, chart_or_context, foreach_instance_in_context_cb, vbd) > 0) + found = true; + + string_freez(vbd->dim); + + vbd->dimension = vbd_back.dimension; + vbd->dim = vbd_back.dim; + vbd->dimension_length = vbd_back.dimension_length; + // vbd->dimension_selection = vbd_back.dimension_selection; + + return found; +} + +bool alert_variable_from_running_alerts(struct variable_lookup_job *vbd) { + bool found = false; + RRDCALC 
*rc; + foreach_rrdcalc_in_rrdhost_read(vbd->host, rc) { + if(rc->config.name == vbd->variable) { + variable_lookup_add_result_with_score(vbd, (NETDATA_DOUBLE)rc->value, rc->rrdset, "alarm value"); + found = true; + } + } + foreach_rrdcalc_in_rrdhost_done(rc); + return found; +} + +bool alert_variable_lookup_internal(STRING *variable, void *data, NETDATA_DOUBLE *result, BUFFER *wb) { + static STRING *this_string = NULL, + *now_string = NULL, + *after_string = NULL, + *before_string = NULL, + *status_string = NULL, + *removed_string = NULL, + *uninitialized_string = NULL, + *undefined_string = NULL, + *clear_string = NULL, + *warning_string = NULL, + *critical_string = NULL, + *last_collected_t_string = NULL, + *update_every_string = NULL; + + + struct variable_lookup_job vbd = { 0 }; + +// const char *v_name = string2str(variable); +// bool trace_this = false; +// if(strcmp(v_name, "btrfs_allocated") == 0) +// trace_this = true; + + bool found = false; + + const char *source = NULL; + RRDSET *source_st = NULL; + + RRDCALC *rc = data; + RRDSET *st = rc->rrdset; + + if(!st) + return false; + + if(unlikely(!last_collected_t_string)) { + this_string = string_strdupz("this"); + now_string = string_strdupz("now"); + after_string = string_strdupz("after"); + before_string = string_strdupz("before"); + status_string = string_strdupz("status"); + removed_string = string_strdupz("REMOVED"); + undefined_string = string_strdupz("UNDEFINED"); + uninitialized_string = string_strdupz("UNINITIALIZED"); + clear_string = string_strdupz("CLEAR"); + warning_string = string_strdupz("WARNING"); + critical_string = string_strdupz("CRITICAL"); + last_collected_t_string = string_strdupz("last_collected_t"); + update_every_string = string_strdupz("update_every"); + } + + if(unlikely(variable == this_string)) { + *result = (NETDATA_DOUBLE)rc->value; + source = "current alert value"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == after_string)) { + *result = 
(NETDATA_DOUBLE)rc->db_after; + source = "current alert query start time"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == before_string)) { + *result = (NETDATA_DOUBLE)rc->db_before; + source = "current alert query end time"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == now_string)) { + *result = (NETDATA_DOUBLE)now_realtime_sec(); + source = "current wall-time clock timestamp"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == status_string)) { + *result = (NETDATA_DOUBLE)rc->status; + source = "current alert status"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == removed_string)) { + *result = (NETDATA_DOUBLE)RRDCALC_STATUS_REMOVED; + source = "removed status constant"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == uninitialized_string)) { + *result = (NETDATA_DOUBLE)RRDCALC_STATUS_UNINITIALIZED; + source = "uninitialized status constant"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == undefined_string)) { + *result = (NETDATA_DOUBLE)RRDCALC_STATUS_UNDEFINED; + source = "undefined status constant"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == clear_string)) { + *result = (NETDATA_DOUBLE)RRDCALC_STATUS_CLEAR; + source = "clear status constant"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == warning_string)) { + *result = (NETDATA_DOUBLE)RRDCALC_STATUS_WARNING; + source = "warning status constant"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == critical_string)) { + *result = (NETDATA_DOUBLE)RRDCALC_STATUS_CRITICAL; + source = "critical status constant"; + source_st = st; + found = true; + goto log; + } + + if(unlikely(variable == last_collected_t_string)) { + *result = (NETDATA_DOUBLE)st->last_collected_time.tv_sec; + source = "current instance last_collected_t"; + source_st = st; + found 
= true; + goto log; + } + + if(unlikely(variable == update_every_string)) { + *result = (NETDATA_DOUBLE)st->update_every; + source = "current instance update_every"; + source_st = st; + found = true; + goto log; + } + + // find the dimension id/name + + vbd = (struct variable_lookup_job){ + .rc = rc, + .host = st->rrdhost, + .variable = variable, + .dimension = string2str(variable), + .dimension_length = string_strlen(variable), + .dimension_selection = DIM_SELECT_NORMAL, + .dim = string_dup(variable), + .result = { 0 }, + }; + + // a "_raw" or "_last_collected_t" suffix selects which value of the dimension is returned; + // vbd.dim is then replaced with the name without the suffix, so the string obtained from + // string_dup() above must be released first, otherwise only the replacement is freed at the end + // NOTE(review): assumes string_dup() takes a reference that string_freez() drops - confirm + if (strendswith_lengths(vbd.dimension, vbd.dimension_length, "_raw", 4)) { + vbd.dimension_length -= 4; + vbd.dimension_selection = DIM_SELECT_RAW; + string_freez(vbd.dim); + vbd.dim = string_strndupz(vbd.dimension, vbd.dimension_length); + } else if (strendswith_lengths(vbd.dimension, vbd.dimension_length, "_last_collected_t", 17)) { + vbd.dimension_length -= 17; + vbd.dimension_selection = DIM_SELECT_LAST_COLLECTED; + string_freez(vbd.dim); + vbd.dim = string_strndupz(vbd.dimension, vbd.dimension_length); + } + + // 1st candidate: the chart the alert is attached to (stop at the first match) + if(variable_lookup_in_chart(&vbd, st, true)) { + found = true; + goto find_best_scored; + } + + // host variables + { + NETDATA_DOUBLE n; + found = rrdvar_get_custom_host_variable_value(vbd.host, vbd.variable, &n); + if(found) { + variable_lookup_add_result_with_score(&vbd, n, st, "host variable"); + goto find_best_scored; + } + } + + // alert names + if(alert_variable_from_running_alerts(&vbd)) { + found = true; + goto find_best_scored; + } + + // find the components of the variable + // the name is split at each '.', from the last one backwards: the left part is tried as a + // chart or context and the right part as the dimension; the left part must itself contain a '.' + { + char id[string_strlen(vbd.dim) + 1]; + memcpy(id, string2str(vbd.dim), string_strlen(vbd.dim)); + id[string_strlen(vbd.dim)] = '\0'; + + char *dot = strrchr(id, '.'); + while(dot) { + *dot = '\0'; + + if(strchr(id, '.') == NULL) break; + + if(variable_lookup_context(&vbd, id, dot + 1)) + found = true; + + char *dot2 = strrchr(id, '.'); + *dot = '.'; + dot = dot2; + } + } + +find_best_scored: + if(found && vbd.result.array) { + struct variable_lookup_score *best = &vbd.result.array[0]; + for 
(size_t i = 1; i < vbd.result.used; i++) + if (vbd.result.array[i].score > best->score) + best = &vbd.result.array[i]; + + source = best->source; + source_st = best->st; + *result = best->value; + freez(vbd.result.array); + } + else { + found = false; + *result = NAN; + } + +log: +#ifdef NETDATA_LOG_HEALTH_VARIABLES_LOOKUP + if(found) { + nd_log(NDLS_DAEMON, NDLP_INFO, + "HEALTH_VARIABLE_LOOKUP: variable '%s' of alert '%s' of chart '%s', context '%s', host '%s' " + "resolved with %s of chart '%s' and context '%s'", + string2str(variable), + string2str(rc->config.name), + string2str(rc->rrdset->id), + string2str(rc->rrdset->context), + string2str(rc->rrdset->rrdhost->hostname), + source, + string2str(source_st->id), + string2str(source_st->context) + ); + } + else { + nd_log(NDLS_DAEMON, NDLP_INFO, + "HEALTH_VARIABLE_LOOKUP: variable '%s' of alert '%s' of chart '%s', context '%s', host '%s' " + "could not be resolved", + string2str(variable), + string2str(rc->config.name), + string2str(rc->rrdset->id), + string2str(rc->rrdset->context), + string2str(rc->rrdset->rrdhost->hostname) + ); + } +#endif + + if(unlikely(wb)) { + buffer_json_member_add_string(wb, "variable", string2str(variable)); + buffer_json_member_add_string(wb, "instance", string2str(st->id)); + buffer_json_member_add_string(wb, "context", string2str(st->context)); + buffer_json_member_add_boolean(wb, "found", found); + + if (found) { + buffer_json_member_add_double(wb, "value", *result); + buffer_json_member_add_object(wb, "source"); + { + buffer_json_member_add_string(wb, "description", source); + buffer_json_member_add_string(wb, "instance", string2str(source_st->id)); + buffer_json_member_add_string(wb, "context", string2str(source_st->context)); + buffer_json_member_add_uint64(wb, "candidates", vbd.result.used ? 
vbd.result.used : 1); + } + buffer_json_object_close(wb); // source + } + } + + string_freez(vbd.dim); + + return found; +} + +bool alert_variable_lookup(STRING *variable, void *data, NETDATA_DOUBLE *result) { + return alert_variable_lookup_internal(variable, data, result, NULL); +} + +int alert_variable_lookup_trace(RRDHOST *host __maybe_unused, RRDSET *st, const char *variable, BUFFER *wb) { + int code = HTTP_RESP_INTERNAL_SERVER_ERROR; + + buffer_flush(wb); + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); + + STRING *v = string_strdupz(variable); + RRDCALC rc = { + .rrdset = st, + }; + + NETDATA_DOUBLE n; + alert_variable_lookup_internal(v, &rc, &n, wb); + + string_freez(v); + + buffer_json_finalize(wb); + return code; +} diff --git a/src/health/notifications/README.md b/src/health/notifications/README.md new file mode 100644 index 000000000..5a2b032a3 --- /dev/null +++ b/src/health/notifications/README.md @@ -0,0 +1,207 @@ +# Agent alert notifications + +This is a reference documentation for Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + +The `script to execute on alarm` line in `netdata.conf` defines the external script that will be called once the alert is triggered. + +The default script is `alarm-notify.sh`. + +> ### Info +> +> This file mentions editing configuration files. +> +> - To edit configuration files in a safe way, we provide the [`edit config` script](/docs/netdata-agent/configuration/README.md#edit-netdataconf)located in your [Netdata config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory) (typically is `/etc/netdata`) that creates the proper file and opens it in an editor automatically. +> Note that to run the script you need to be inside your Netdata config directory. 
+> +> - Please also note that after most configuration changes you will need to [restart the Agent](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for the changes to take effect. +> +> This is the recommended way to configure Netdata. + +You can change the default script globally by editing `netdata.conf` and changing the `script to execute on alarm` in the `[health]` section. + +`alarm-notify.sh` is capable of sending notifications: + +- to multiple recipients +- using multiple notification methods +- filtering severity per recipient + +It uses **roles**. For example `sysadmin`, `webmaster`, `dba`, etc. + +Each alert is assigned to one or more roles, using the `to` line of the alert configuration. For example, here is the alert configuration for `ram.conf` that defaults to the role `sysadmin`: + +```conf + alarm: ram_in_use + on: system.ram + class: Utilization + type: System +component: Memory + os: linux + hosts: * + calc: $used * 100 / ($used + $cached + $free + $buffers) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: system memory utilization + to: sysadmin +``` + +Then `alarm-notify.sh` uses its own configuration file `health_alarm_notify.conf`, which at the bottom of the file stores the recipients per role, for all notification methods. + +Here is an example of the `sysadmin` role's recipients for email notifications. +You can send the notification to multiple recipients by separating the emails with a space. 
+ +```conf + +############################################################################### +# RECIPIENTS PER ROLE + +# ----------------------------------------------------------------------------- +# generic system alerts +# CPU, disks, network interfaces, entropy, etc + +role_recipients_email[sysadmin]="someone@example.com someoneelse@example.com" +``` + +Each role may have one or more destinations and one or more notification methods. + +So, for example the `sysadmin` role may send: + +1. emails to admin1@example.com and admin2@example.com +2. pushover.net notifications to USERTOKENS `A`, `B` and `C`. +3. pushbullet.com push notifications to admin1@example.com and admin2@example.com +4. messages to the `#alerts` and `#systems` channels of a Slack workspace. +5. messages to Discord channels `#alerts` and `#systems`. + +## Configuration + +You can edit `health_alarm_notify.conf` using the `edit-config` script to configure: + +- **Settings** per notification method: + + All notification methods except email require some configuration (e.g. API keys, tokens, destination rooms, channels, etc). Please check this section's content to find the configuration guides for your notification option of choice. + +- **Recipients** per role per notification method + + ```conf + role_recipients_email[sysadmin]="${DEFAULT_RECIPIENT_EMAIL}" + role_recipients_pushover[sysadmin]="${DEFAULT_RECIPIENT_PUSHOVER}" + role_recipients_pushbullet[sysadmin]="${DEFAULT_RECIPIENT_PUSHBULLET}" + role_recipients_telegram[sysadmin]="${DEFAULT_RECIPIENT_TELEGRAM}" + role_recipients_slack[sysadmin]="${DEFAULT_RECIPIENT_SLACK}" + ... + ``` + + Here you can change the `${DEFAULT_...}` values to the values of the recipients you want, separated by a space if you have multiple recipients. 
+ +## Testing Alert Notifications + +You can run the following command by hand, to test alerts configuration: + +```sh +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alerts to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alerts to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +If you are [running your own registry](/src/registry/README.md#run-your-own-registry), add `export NETDATA_REGISTRY_URL=[YOUR_URL]` before calling `alarm-notify.sh`. + +> If you need to dig even deeper, you can trace the execution with `bash -x`. Note that in test mode, `alarm-notify.sh` calls itself with many more arguments. So first do: +> +>```sh +>bash -x /usr/libexec/netdata/plugins.d/alarm-notify.sh test +>``` +> +> And then look in the output for the alarm-notify.sh calls and run the one you want to trace with `bash -x`. + +## Global configuration options + +### Notification Filtering + +When you define recipients per role for notification methods, you can append `|critical` to limit the notifications that are sent. + +In the following examples, the first recipient receives all the alerts, while the second one receives only notifications for alerts that have at some point become critical. +The second user may still receive warning and clear notifications, but only for the event that previously caused a critical alert. 
+ +```conf + email : "user1@example.com user2@example.com|critical" + pushover : "2987343...9437837 8756278...2362736|critical" + telegram : "111827421 112746832|critical" + slack : "alerts disasters|critical" + alerta : "alerts disasters|critical" + flock : "alerts disasters|critical" + discord : "alerts disasters|critical" + twilio : "+15555555555 +17777777777|critical" + messagebird: "+15555555555 +17777777777|critical" + kavenegar : "09155555555 09177777777|critical" + pd : "<pd_service_key_1> <pd_service_key_2>|critical" + irc : "<irc_channel_1> <irc_channel_2>|critical" +``` + +If a per role recipient is set to an empty string, the default recipient of the given +notification method (email, pushover, telegram, slack, alerta, etc.) will be used. + +To disable a notification, use the recipient called: disabled +This works for all notification methods (including the default recipients). + +### Proxy configuration + +If you need to send curl based notifications (pushover, pushbullet, slack, alerta, +flock, discord, telegram) via a proxy, you should set these variables to your proxy address: + +```conf +export http_proxy="http://10.0.0.1:3128/" +export https_proxy="http://10.0.0.1:3128/" +``` + +### Notification images + +Images in notifications need to be downloaded from an Internet facing site. + +To allow notification providers to fetch the icons/images, by default we set the URL of the global public netdata registry. + +If you have an Internet facing netdata (or you have copied the images/ folder +of netdata to your web server), set its URL here, to fetch the notification +images from it. + +```conf +images_base_url="http://my.public.netdata.server:19999" +``` + +### Date handling + +You can configure netdata alerts to send dates in any format you want via editing the `date_format` variable. + +This uses standard `date` command format strings. See `man date` for +more info on what formats are supported. 
+ +Note that this has to start with a '+', otherwise it won't work. + +- For ISO 8601 dates, use `+%FT%T%z` +- For RFC 5322 dates, use `+%a, %d %b %Y %H:%M:%S %z` +- For RFC 3339 dates, use `+%F %T%:z` +- For RFC 1123 dates, use `+%a, %d %b %Y %H:%M:%S %Z` +- For RFC 1036 dates, use `+%A, %d-%b-%y %H:%M:%S %Z` +- For a reasonably local date and time (in that order), use `+%x %X` +- For the old default behavior (compatible with ANSI C's `asctime()` function), leave the `date_format` field empty. + +### Hostname handling + +By default, Netdata will use the simple hostname for the system (the hostname with everything after the first `.` removed) when displaying the hostname in alert notifications. + +If you instead prefer to have Netdata use the host's fully qualified domain name, you can set `use_fqdn` to `YES`. + +This setting does not account for child systems for which the system you are configuring is a parent. + +> ### Note +> +> If the system's host name is overridden in `/etc/netdata.conf` with the `hostname` option, that name will be used unconditionally. diff --git a/health/notifications/alarm-email.sh b/src/health/notifications/alarm-email.sh index 69c4c3f8d..69c4c3f8d 100755 --- a/health/notifications/alarm-email.sh +++ b/src/health/notifications/alarm-email.sh diff --git a/health/notifications/alarm-notify.sh.in b/src/health/notifications/alarm-notify.sh.in index 9d95c21dc..9a5780de1 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/src/health/notifications/alarm-notify.sh.in @@ -248,31 +248,32 @@ docurl() { # Used in a couple of places to write more compact code. 
method_names=" -email -pushover -pushbullet -telegram -slack alerta -flock +awssns +custom discord +dynatrace +email +fleep +flock +gotify hipchat -twilio +irc +kavenegar +matrix messagebird -pd -fleep -syslog -custom msteams -kavenegar +ntfy +pd prowl -irc -awssns +pushbullet +pushover rocketchat +slack sms -dynatrace -matrix -ntfy +syslog +telegram +twilio " # ----------------------------------------------------------------------------- @@ -2399,7 +2400,7 @@ send_ntfy() { msg="${host} ${status_message}: ${alarm} - ${info}" httpcode=$(docurl -X POST \ "${ntfy_auth_header[@]}" \ - -H "Icon: https://raw.githubusercontent.com/netdata/netdata/master/web/gui/dashboard/images/favicon-196x196.png" \ + -H "Icon: https://raw.githubusercontent.com/netdata/netdata/master/src/web/gui/dashboard/images/favicon-196x196.png" \ -H "Title: ${host}: ${name//_/ }" \ -H "Tags: ${emoji}" \ -H "Priority: ${priority}" \ diff --git a/health/notifications/alarm-test.sh b/src/health/notifications/alarm-test.sh index 828aa756b..828aa756b 100755 --- a/health/notifications/alarm-test.sh +++ b/src/health/notifications/alarm-test.sh diff --git a/src/health/notifications/alerta/README.md b/src/health/notifications/alerta/README.md new file mode 100644 index 000000000..40fef3fd7 --- /dev/null +++ b/src/health/notifications/alerta/README.md @@ -0,0 +1,128 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/alerta/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/alerta/metadata.yaml" +sidebar_label: "Alerta" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Alerta + + +<img src="https://netdata.cloud/img/alerta.png" width="150"/> + + +The [Alerta](https://alerta.io/) monitoring system is a tool used to consolidate and 
de-duplicate alerts from multiple sources for quick ‘at-a-glance’ visualization. With just one system you can monitor alerts from many other monitoring tools on a single screen. +You can send Netdata alerts to Alerta to see alerts coming from many Netdata hosts or also from a multi-host Netdata configuration. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- A working Alerta instance +- An Alerta API key (if authentication in Alerta is enabled) +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_ALERTA | Set `SEND_ALERTA` to YES | | yes | +| ALERTA_WEBHOOK_URL | set `ALERTA_WEBHOOK_URL` to the API url you defined when you installed the Alerta server. | | yes | +| ALERTA_API_KEY | Set `ALERTA_API_KEY` to your API key. | | yes | +| DEFAULT_RECIPIENT_ALERTA | Set `DEFAULT_RECIPIENT_ALERTA` to the default recipient environment you want the alert notifications to be sent to. All roles will default to this variable if left unconfigured. 
| | yes | +| DEFAULT_RECIPIENT_CUSTOM | Set different recipient environments per role, by editing `DEFAULT_RECIPIENT_CUSTOM` with the environment name of your choice | | no | + +##### ALERTA_API_KEY + +You will need an API key to send messages from any source, if Alerta is configured to use authentication (recommended). To create a new API key: +1. Go to Configuration > API Keys. +2. Create a new API key called "netdata" with `write:alerts` permission. + + +##### DEFAULT_RECIPIENT_CUSTOM + +The `DEFAULT_RECIPIENT_CUSTOM` can be edited in the following entries at the bottom of the same file: + +```conf +role_recipients_alerta[sysadmin]="Systems" +role_recipients_alerta[domainadmin]="Domains" +role_recipients_alerta[dba]="Databases Systems" +role_recipients_alerta[webmaster]="Marketing Development" +role_recipients_alerta[proxyadmin]="Proxy" +role_recipients_alerta[sitemgr]="Sites" +``` + +The values you provide should be defined as environments in `/etc/alertad.conf` with `ALLOWED_ENVIRONMENTS` option. + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# alerta (alerta.io) global notification options + +SEND_ALERTA="YES" +ALERTA_WEBHOOK_URL="http://yourserver/alerta/api" +ALERTA_API_KEY="INSERT_YOUR_API_KEY_HERE" +DEFAULT_RECIPIENT_ALERTA="Production" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/alerta/metadata.yaml b/src/health/notifications/alerta/metadata.yaml index f815032b9..f815032b9 100644 --- a/health/notifications/alerta/metadata.yaml +++ b/src/health/notifications/alerta/metadata.yaml diff --git a/src/health/notifications/awssns/README.md b/src/health/notifications/awssns/README.md new file mode 100644 index 000000000..b5a4cc5f4 --- /dev/null +++ b/src/health/notifications/awssns/README.md @@ -0,0 +1,180 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/awssns/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/awssns/metadata.yaml" +sidebar_label: "AWS SNS" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# AWS SNS + + +<img src="https://netdata.cloud/img/aws.svg" width="150"/> + + +As part of its AWS suite, Amazon provides a notification broker service called 'Simple Notification Service' (SNS). Amazon SNS works similarly to Netdata's own notification system, allowing to dispatch a single notification to multiple subscribers of different types. Among other things, SNS supports sending notifications to: +- Email addresses +- Mobile Phones via SMS +- HTTP or HTTPS web hooks +- AWS Lambda functions +- AWS SQS queues +- Mobile applications via push notifications +You can send notifications through Amazon SNS using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Limitations + +- While Amazon SNS supports sending differently formatted messages for different delivery methods, Netdata does not currently support this functionality. 
+- For email notification support, we recommend using Netdata's email notifications, as it has the following benefits: + - In most cases, it requires less configuration. + - Netdata's emails are nicely pre-formatted and support features like threading, which requires a lot of manual effort in SNS. + - It is less resource intensive and more cost-efficient than SNS. + + + +## Setup + +### Prerequisites + +#### + +- The [Amazon Web Services CLI tools](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) (awscli). +- An actual home directory for the user you run Netdata as, instead of just using `/` as a home directory. The setup depends on the distribution, but `/var/lib/netdata` is the recommended directory. If you are using Netdata as a dedicated user, the permissions will already be correct. +- An Amazon SNS topic to send notifications to with one or more subscribers. The Getting Started section of the Amazon SNS documentation covers the basics of how to set this up. Make note of the Topic ARN when you create the topic. +- While not mandatory, it is highly recommended to create a dedicated IAM user on your account for Netdata to send notifications. This user needs to have programmatic access, and should only allow access to SNS. For an additional layer of security, you can create one for each system or group of systems. +- Terminal access to the Agent you wish to configure. + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| aws path | The full path of the aws command. If empty, the system `$PATH` will be searched for it. If not found, Amazon SNS notifications will be silently disabled. | | yes | +| SEND_AWSSNS | Set `SEND_AWSSNS` to YES | YES | yes | +| AWSSNS_MESSAGE_FORMAT | Set `AWSSNS_MESSAGE_FORMAT` to the string that you want the alert to be sent as. | ${status} on ${host} at ${date}: ${chart} ${value_string} | yes | +| DEFAULT_RECIPIENT_AWSSNS | Set `DEFAULT_RECIPIENT_AWSSNS` to the Topic ARN you noted down upon creating the Topic. | | yes | + +##### AWSSNS_MESSAGE_FORMAT + +The supported variables are: + +| Variable name | Description | +|:---------------------------:|:---------------------------------------------------------------------------------| +| `${alarm}` | Like "name = value units" | +| `${status_message}` | Like "needs attention", "recovered", "is critical" | +| `${severity}` | Like "Escalated to CRITICAL", "Recovered from WARNING" | +| `${raised_for}` | Like "(alarm was raised for 10 minutes)" | +| `${host}` | The host generated this event | +| `${url_host}` | Same as ${host} but URL encoded | +| `${unique_id}` | The unique id of this event | +| `${alarm_id}` | The unique id of the alarm that generated this event | +| `${event_id}` | The incremental id of the event, for this alarm id | +| `${when}` | The timestamp this event occurred | +| `${name}` | The name of the alarm, as given in netdata health.d entries | +| `${url_name}` | Same as ${name} but URL encoded | +| `${chart}` | The name of the chart (type.id) | +| `${url_chart}` | Same as ${chart} but URL encoded | +| `${status}` | The current status : REMOVED, UNINITIALIZED, 
UNDEFINED, CLEAR, WARNING, CRITICAL | +| `${old_status}` | The previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL | +| `${value}` | The current value of the alarm | +| `${old_value}` | The previous value of the alarm | +| `${src}` | The line number and file the alarm has been configured | +| `${duration}` | The duration in seconds of the previous alarm state | +| `${duration_txt}` | Same as ${duration} for humans | +| `${non_clear_duration}` | The total duration in seconds this is/was non-clear | +| `${non_clear_duration_txt}` | Same as ${non_clear_duration} for humans | +| `${units}` | The units of the value | +| `${info}` | A short description of the alarm | +| `${value_string}` | Friendly value (with units) | +| `${old_value_string}` | Friendly old value (with units) | +| `${image}` | The URL of an image to represent the status of the alarm | +| `${color}` | A color in AABBCC format for the alarm | +| `${goto_url}` | The URL the user can click to see the netdata dashboard | +| `${calc_expression}` | The expression evaluated to provide the value for the alarm | +| `${calc_param_values}` | The value of the variables in the evaluated expression | +| `${total_warnings}` | The total number of alarms in WARNING state on the host | +| `${total_critical}` | The total number of alarms in CRITICAL state on the host | + + +##### DEFAULT_RECIPIENT_AWSSNS + +All roles will default to this variable if left unconfigured. 
+ +You can have different recipient Topics per **role**, by editing `DEFAULT_RECIPIENT_AWSSNS` with the Topic ARN you want, in the following entries at the bottom of the same file: + +```conf +role_recipients_awssns[sysadmin]="arn:aws:sns:us-east-2:123456789012:Systems" +role_recipients_awssns[domainadmin]="arn:aws:sns:us-east-2:123456789012:Domains" +role_recipients_awssns[dba]="arn:aws:sns:us-east-2:123456789012:Databases" +role_recipients_awssns[webmaster]="arn:aws:sns:us-east-2:123456789012:Development" +role_recipients_awssns[proxyadmin]="arn:aws:sns:us-east-2:123456789012:Proxy" +role_recipients_awssns[sitemgr]="arn:aws:sns:us-east-2:123456789012:Sites" +``` + + +</details> + +#### Examples + +##### Basic Configuration + +An example working configuration would be: + +```conf +#------------------------------------------------------------------------------ +# Amazon SNS notifications + +SEND_AWSSNS="YES" +AWSSNS_MESSAGE_FORMAT="${status} on ${host} at ${date}: ${chart} ${value_string}" +DEFAULT_RECIPIENT_AWSSNS="arn:aws:sns:us-east-2:123456789012:MyTopic" +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/awssns/metadata.yaml b/src/health/notifications/awssns/metadata.yaml index 93389bad0..93389bad0 100644 --- a/health/notifications/awssns/metadata.yaml +++ b/src/health/notifications/awssns/metadata.yaml diff --git a/src/health/notifications/custom/README.md b/src/health/notifications/custom/README.md new file mode 100644 index 000000000..785aec59d --- /dev/null +++ b/src/health/notifications/custom/README.md @@ -0,0 +1,211 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/custom/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/custom/metadata.yaml" +sidebar_label: "Custom" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Custom + + +<img src="https://netdata.cloud/img/custom.png" width="150"/> + + +Netdata Agent's alert notification feature allows you to send custom notifications to any endpoint you choose. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_CUSTOM | Set `SEND_CUSTOM` to YES | YES | yes | +| DEFAULT_RECIPIENT_CUSTOM | This value is dependent on how you handle the `${to}` variable inside the `custom_sender()` function. | | yes | +| custom_sender() | You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the function in this configuration file. | | no | + +##### DEFAULT_RECIPIENT_CUSTOM + +All roles will default to this variable if left unconfigured. You can edit `DEFAULT_RECIPIENT_CUSTOM` with the variable you want, in the following entries at the bottom of the same file: +``` +role_recipients_custom[sysadmin]="systems" +role_recipients_custom[domainadmin]="domains" +role_recipients_custom[dba]="databases systems" +role_recipients_custom[webmaster]="marketing development" +role_recipients_custom[proxyadmin]="proxy-admin" +role_recipients_custom[sitemgr]="sites" +``` + + +##### custom_sender() + +The following is a sample custom_sender() function in health_alarm_notify.conf, to send an SMS via an imaginary HTTPS endpoint to the SMS gateway: +``` +custom_sender() { + # example human readable SMS + local msg="${host} ${status_message}: ${alarm} ${raised_for}" + + # limit it to 160 characters and encode it for use in a URL + urlencode "${msg:0:160}" >/dev/null; msg="${REPLY}" + + # a space separated list of the recipients to send alarms to + to="${1}" + + for phone in ${to}; do + httpcode=$(docurl -X POST \ + --data-urlencode "From=XXX" \ + --data-urlencode "To=${phone}" \ + --data-urlencode "Body=${msg}" \ + -u "${accountsid}:${accounttoken}" \ + https://domain.website.com/) + + if [ "${httpcode}" = "200" ]; 
then + info "sent custom notification ${msg} to ${phone}" + sent=$((sent + 1)) + else + error "failed to send custom notification ${msg} to ${phone} with HTTP error code ${httpcode}." + fi + done +} +``` + +The supported variables that you can use for the function's `msg` variable are: + +| Variable name | Description | +|:---------------------------:|:---------------------------------------------------------------------------------| +| `${alarm}` | Like "name = value units" | +| `${status_message}` | Like "needs attention", "recovered", "is critical" | +| `${severity}` | Like "Escalated to CRITICAL", "Recovered from WARNING" | +| `${raised_for}` | Like "(alarm was raised for 10 minutes)" | +| `${host}` | The host generated this event | +| `${url_host}` | Same as ${host} but URL encoded | +| `${unique_id}` | The unique id of this event | +| `${alarm_id}` | The unique id of the alarm that generated this event | +| `${event_id}` | The incremental id of the event, for this alarm id | +| `${when}` | The timestamp this event occurred | +| `${name}` | The name of the alarm, as given in netdata health.d entries | +| `${url_name}` | Same as ${name} but URL encoded | +| `${chart}` | The name of the chart (type.id) | +| `${url_chart}` | Same as ${chart} but URL encoded | +| `${status}` | The current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL | +| `${old_status}` | The previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL | +| `${value}` | The current value of the alarm | +| `${old_value}` | The previous value of the alarm | +| `${src}` | The line number and file the alarm has been configured | +| `${duration}` | The duration in seconds of the previous alarm state | +| `${duration_txt}` | Same as ${duration} for humans | +| `${non_clear_duration}` | The total duration in seconds this is/was non-clear | +| `${non_clear_duration_txt}` | Same as ${non_clear_duration} for humans | +| `${units}` | The units of the value | +| 
`${info}` | A short description of the alarm | +| `${value_string}` | Friendly value (with units) | +| `${old_value_string}` | Friendly old value (with units) | +| `${image}` | The URL of an image to represent the status of the alarm | +| `${color}` | A color in AABBCC format for the alarm | +| `${goto_url}` | The URL the user can click to see the netdata dashboard | +| `${calc_expression}` | The expression evaluated to provide the value for the alarm | +| `${calc_param_values}` | The value of the variables in the evaluated expression | +| `${total_warnings}` | The total number of alarms in WARNING state on the host | +| `${total_critical}` | The total number of alarms in CRITICAL state on the host | + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# custom notifications + +SEND_CUSTOM="YES" +DEFAULT_RECIPIENT_CUSTOM="" + +# The custom_sender() is a custom function to do whatever you need to do +custom_sender() { + # example human readable SMS + local msg="${host} ${status_message}: ${alarm} ${raised_for}" + + # limit it to 160 characters and encode it for use in a URL + urlencode "${msg:0:160}" >/dev/null; msg="${REPLY}" + + # a space separated list of the recipients to send alarms to + to="${1}" + + for phone in ${to}; do + httpcode=$(docurl -X POST \ + --data-urlencode "From=XXX" \ + --data-urlencode "To=${phone}" \ + --data-urlencode "Body=${msg}" \ + -u "${accountsid}:${accounttoken}" \ + https://domain.website.com/) + + if [ "${httpcode}" = "200" ]; then + info "sent custom notification ${msg} to ${phone}" + sent=$((sent + 1)) + else + error "failed to send custom notification ${msg} to ${phone} with HTTP error code ${httpcode}." 
+ fi + done +} + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/custom/metadata.yaml b/src/health/notifications/custom/metadata.yaml index 557539cfb..557539cfb 100644 --- a/health/notifications/custom/metadata.yaml +++ b/src/health/notifications/custom/metadata.yaml diff --git a/src/health/notifications/discord/README.md b/src/health/notifications/discord/README.md new file mode 100644 index 000000000..128e04a44 --- /dev/null +++ b/src/health/notifications/discord/README.md @@ -0,0 +1,117 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/discord/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/discord/metadata.yaml" +sidebar_label: "Discord" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Discord + + +<img src="https://netdata.cloud/img/discord.png" width="150"/> + + +Send notifications to Discord using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- The incoming webhook URL as given by Discord. 
Create a webhook by following the official [Discord documentation](https://support.discord.com/hc/en-us/articles/228383668-Intro-to-Webhooks). You can use the same on all your Netdata servers (or you can have multiple if you like - your decision). +- One or more Discord channels to post the messages to +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_DISCORD | Set `SEND_DISCORD` to YES | YES | yes | +| DISCORD_WEBHOOK_URL | set `DISCORD_WEBHOOK_URL` to your webhook URL. | | yes | +| DEFAULT_RECIPIENT_DISCORD | Set `DEFAULT_RECIPIENT_DISCORD` to the channel you want the alert notifications to be sent to. You can define multiple channels like this: `alerts` `systems`. | | yes | + +##### DEFAULT_RECIPIENT_DISCORD + +All roles will default to this variable if left unconfigured. 
+You can then have different channels per role, by editing `DEFAULT_RECIPIENT_DISCORD` with the channel you want, in the following entries at the bottom of the same file: +```conf +role_recipients_discord[sysadmin]="systems" +role_recipients_discord[domainadmin]="domains" +role_recipients_discord[dba]="databases systems" +role_recipients_discord[webmaster]="marketing development" +role_recipients_discord[proxyadmin]="proxy-admin" +role_recipients_discord[sitemgr]="sites" +``` + +The values you provide should already exist as Discord channels in your server. + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# discord (discordapp.com) global notification options + +SEND_DISCORD="YES" +DISCORD_WEBHOOK_URL="https://discord.com/api/webhooks/XXXXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" +DEFAULT_RECIPIENT_DISCORD="alerts" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/discord/metadata.yaml b/src/health/notifications/discord/metadata.yaml index a46a8ec98..a46a8ec98 100644 --- a/health/notifications/discord/metadata.yaml +++ b/src/health/notifications/discord/metadata.yaml diff --git a/src/health/notifications/dynatrace/README.md b/src/health/notifications/dynatrace/README.md new file mode 100644 index 000000000..6785cdb82 --- /dev/null +++ b/src/health/notifications/dynatrace/README.md @@ -0,0 +1,124 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/dynatrace/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/dynatrace/metadata.yaml" +sidebar_label: "Dynatrace" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Dynatrace + + +<img src="https://netdata.cloud/img/dynatrace.svg" width="150"/> + + +Dynatrace allows you to receive notifications using their Events REST API. See the [Dynatrace documentation](https://www.dynatrace.com/support/help/dynatrace-api/environment-api/events-v2/post-event) about POSTing an event in the Events API for more details. +You can send notifications to Dynatrace using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- A Dynatrace Server. You can use the same on all your Netdata servers but make sure the server is network visible from your Netdata hosts. The Dynatrace server should be with protocol prefixed (http:// or https://), for example: https://monitor.example.com. +- An API Token. Generate a secure access API token that enables access to your Dynatrace monitoring data via the REST-based API. 
See [Dynatrace API - Authentication](https://www.dynatrace.com/support/help/extend-dynatrace/dynatrace-api/basics/dynatrace-api-authentication/) for more details. +- An API Space. This is the URL part of the page you have access in order to generate the API Token. For example, the URL for a generated API token might look like: https://monitor.illumineit.com/e/2a93fe0e-4cd5-469a-9d0d-1a064235cfce/#settings/integration/apikeys;gf=all In that case, the Space is 2a93fe0e-4cd5-469a-9d0d-1a064235cfce. +- A Server Tag. To generate one on your Dynatrace Server, go to Settings --> Tags --> Manually applied tags and create the Tag. The Netdata alarm is sent as a Dynatrace Event to be correlated with all those hosts tagged with this Tag you have created. +- Terminal access to the Agent you wish to configure + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_DYNATRACE | Set `SEND_DYNATRACE` to YES | YES | yes | +| DYNATRACE_SERVER | Set `DYNATRACE_SERVER` to the Dynatrace server with the protocol prefix, for example `https://monitor.example.com`. | | yes | +| DYNATRACE_TOKEN | Set `DYNATRACE_TOKEN` to your Dynatrace API authentication token | | yes | +| DYNATRACE_SPACE | Set `DYNATRACE_SPACE` to the API Space, it is the URL part of the page you have access in order to generate the API Token. | | yes | +| DYNATRACE_TAG_VALUE | Set `DYNATRACE_TAG_VALUE` to your Dynatrace Server Tag. 
| | yes | +| DYNATRACE_ANNOTATION_TYPE | `DYNATRACE_ANNOTATION_TYPE` can be left to its default value Netdata Alarm, but you can change it to better fit your needs. | Netdata Alarm | no | +| DYNATRACE_EVENT | Set `DYNATRACE_EVENT` to the Dynatrace eventType you want. | Netdata Alarm | no | + +##### DYNATRACE_SPACE + +For example, the URL for a generated API token might look like: https://monitor.illumineit.com/e/2a93fe0e-4cd5-469a-9d0d-1a064235cfce/#settings/integration/apikeys;gf=all In that case, the Space is 2a93fe0e-4cd5-469a-9d0d-1a064235cfce. + + +##### DYNATRACE_EVENT + +`AVAILABILITY_EVENT`, `CUSTOM_ALERT`, `CUSTOM_ANNOTATION`, `CUSTOM_CONFIGURATION`, `CUSTOM_DEPLOYMENT`, `CUSTOM_INFO`, `ERROR_EVENT`, +`MARKED_FOR_TERMINATION`, `PERFORMANCE_EVENT`, `RESOURCE_CONTENTION_EVENT`. +You can read more [here](https://www.dynatrace.com/support/help/dynatrace-api/environment-api/events-v2/post-event#request-body-objects). + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# Dynatrace global notification options + +SEND_DYNATRACE="YES" +DYNATRACE_SERVER="https://monitor.example.com" +DYNATRACE_TOKEN="XXXXXXX" +DYNATRACE_SPACE="2a93fe0e-4cd5-469a-9d0d-1a064235cfce" +DYNATRACE_TAG_VALUE="SERVERTAG" +DYNATRACE_ANNOTATION_TYPE="Netdata Alert" +DYNATRACE_EVENT="AVAILABILITY_EVENT" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/dynatrace/metadata.yaml b/src/health/notifications/dynatrace/metadata.yaml index a88c766fd..a88c766fd 100644 --- a/health/notifications/dynatrace/metadata.yaml +++ b/src/health/notifications/dynatrace/metadata.yaml diff --git a/src/health/notifications/email/README.md b/src/health/notifications/email/README.md new file mode 100644 index 000000000..1e831d58e --- /dev/null +++ b/src/health/notifications/email/README.md @@ -0,0 +1,114 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/email/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/email/metadata.yaml" +sidebar_label: "Email" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Email + + +<img src="https://netdata.cloud/img/email.png" width="150"/> + + +Send notifications via Email using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- A working sendmail command is required for email alerts to work. Almost all MTAs provide a sendmail interface. Netdata sends all emails as user netdata, so make sure your sendmail works for local users. +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| EMAIL_SENDER | You can change `EMAIL_SENDER` to the email address sending the notifications. | netdata | no | +| SEND_EMAIL | Set `SEND_EMAIL` to YES | YES | yes | +| DEFAULT_RECIPIENT_EMAIL | Set `DEFAULT_RECIPIENT_EMAIL` to the email address you want the email to be sent to by default. You can define multiple email addresses like this: `alarms@example.com` `systems@example.com`. | root | yes | + +##### DEFAULT_RECIPIENT_EMAIL + +All roles will default to this variable if left unconfigured. +The `DEFAULT_RECIPIENT_EMAIL` can be overridden per role by editing the following entries at the bottom of the same file: +```conf +role_recipients_email[sysadmin]="systems@example.com" +role_recipients_email[domainadmin]="domains@example.com" +role_recipients_email[dba]="databases@example.com systems@example.com" +role_recipients_email[webmaster]="marketing@example.com development@example.com" +role_recipients_email[proxyadmin]="proxy-admin@example.com" +role_recipients_email[sitemgr]="sites@example.com" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# email global notification options + +EMAIL_SENDER="example@domain.com" +SEND_EMAIL="YES" +DEFAULT_RECIPIENT_EMAIL="recipient@example.com" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# 
send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/email/metadata.yaml b/src/health/notifications/email/metadata.yaml index f0d4a62a9..f0d4a62a9 100644 --- a/health/notifications/email/metadata.yaml +++ b/src/health/notifications/email/metadata.yaml diff --git a/src/health/notifications/flock/README.md b/src/health/notifications/flock/README.md new file mode 100644 index 000000000..332ede832 --- /dev/null +++ b/src/health/notifications/flock/README.md @@ -0,0 +1,113 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/flock/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/flock/metadata.yaml" +sidebar_label: "Flock" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Flock + + +<img src="https://netdata.cloud/img/flock.png" width="150"/> + + +Send notifications to Flock using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- The incoming webhook URL as given by flock.com. You can use the same on all your Netdata servers (or you can have multiple if you like). Read more about flock webhooks and how to get one [here](https://admin.flock.com/webhooks). +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. 
+ + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_FLOCK | Set `SEND_FLOCK` to YES | YES | yes | +| FLOCK_WEBHOOK_URL | set `FLOCK_WEBHOOK_URL` to your webhook URL. | | yes | +| DEFAULT_RECIPIENT_FLOCK | Set `DEFAULT_RECIPIENT_FLOCK` to the Flock channel you want the alert notifications to be sent to. All roles will default to this variable if left unconfigured. | | yes | + +##### DEFAULT_RECIPIENT_FLOCK + +You can have different channels per role, by editing DEFAULT_RECIPIENT_FLOCK with the channel you want, in the following entries at the bottom of the same file: +```conf +role_recipients_flock[sysadmin]="systems" +role_recipients_flock[domainadmin]="domains" +role_recipients_flock[dba]="databases systems" +role_recipients_flock[webmaster]="marketing development" +role_recipients_flock[proxyadmin]="proxy-admin" +role_recipients_flock[sitemgr]="sites" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# flock (flock.com) global notification options + +SEND_FLOCK="YES" +FLOCK_WEBHOOK_URL="https://api.flock.com/hooks/sendMessage/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" +DEFAULT_RECIPIENT_FLOCK="alarms" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms 
to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/flock/metadata.yaml b/src/health/notifications/flock/metadata.yaml index 62e7f4995..62e7f4995 100644 --- a/health/notifications/flock/metadata.yaml +++ b/src/health/notifications/flock/metadata.yaml diff --git a/src/health/notifications/gotify/README.md b/src/health/notifications/gotify/README.md new file mode 100644 index 000000000..f0f8a7edb --- /dev/null +++ b/src/health/notifications/gotify/README.md @@ -0,0 +1,98 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/gotify/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/gotify/metadata.yaml" +sidebar_label: "Gotify" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Gotify + + +<img src="https://netdata.cloud/img/gotify.png" width="150"/> + + +[Gotify](https://gotify.net/) is a self-hosted push notification service created for sending and receiving messages in real time. +You can send alerts to your Gotify instance using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- An application token. You can generate a new token in the Gotify Web UI. +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. 
+ + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_GOTIFY | Set `SEND_GOTIFY` to YES | YES | yes | +| GOTIFY_APP_TOKEN | set `GOTIFY_APP_TOKEN` to the app token you generated. | | yes | +| GOTIFY_APP_URL | Set `GOTIFY_APP_URL` to point to your Gotify instance, for example `https://push.example.domain/` | | yes | + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +SEND_GOTIFY="YES" +GOTIFY_APP_TOKEN="XXXXXXXXXXXXXXX" +GOTIFY_APP_URL="https://push.example.domain/" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/gotify/metadata.yaml b/src/health/notifications/gotify/metadata.yaml index 4552de1c4..4552de1c4 100644 --- a/health/notifications/gotify/metadata.yaml +++ b/src/health/notifications/gotify/metadata.yaml diff --git a/health/notifications/health_alarm_notify.conf b/src/health/notifications/health_alarm_notify.conf index f3b67c9de..f3b67c9de 100755 --- a/health/notifications/health_alarm_notify.conf +++ b/src/health/notifications/health_alarm_notify.conf diff --git a/health/notifications/health_email_recipients.conf b/src/health/notifications/health_email_recipients.conf index f56c6c64a..f56c6c64a 100644 --- a/health/notifications/health_email_recipients.conf +++ b/src/health/notifications/health_email_recipients.conf diff --git a/src/health/notifications/irc/README.md b/src/health/notifications/irc/README.md new file mode 100644 index 000000000..76d3f5bc2 --- /dev/null +++ b/src/health/notifications/irc/README.md @@ -0,0 +1,132 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/irc/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/irc/metadata.yaml" +sidebar_label: "IRC" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# IRC + + +<img src="https://netdata.cloud/img/irc.png" width="150"/> + + +Send notifications to IRC using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- The `nc` utility. You can set the path to it, or Netdata will search for it in your system `$PATH`. 
+- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| nc path | Set the path for nc, otherwise Netdata will search for it in your system $PATH | | yes | +| SEND_IRC | Set `SEND_IRC` to YES. | YES | yes | +| IRC_NETWORK | Set `IRC_NETWORK` to the IRC network which your preferred channels belong to. | | yes | +| IRC_PORT | Set `IRC_PORT` to the IRC port to which a connection will occur. | | no | +| IRC_NICKNAME | Set `IRC_NICKNAME` to the IRC nickname which is required to send the notification. It must not be an already registered name as the connection's MODE is defined as a guest. | | yes | +| IRC_REALNAME | Set `IRC_REALNAME` to the IRC realname which is required in order to make the connection. | | yes | +| DEFAULT_RECIPIENT_IRC | You can have different channels per role, by editing `DEFAULT_RECIPIENT_IRC` with the channel you want | | yes | + +##### nc path + +```sh +#------------------------------------------------------------------------------ +# external commands +# +# The full path of the nc command. +# If empty, the system $PATH will be searched for it. +# If not found, irc notifications will be silently disabled. 
+nc="/usr/bin/nc" +``` + + +##### DEFAULT_RECIPIENT_IRC + +The `DEFAULT_RECIPIENT_IRC` can be edited in the following entries at the bottom of the same file: +```conf +role_recipients_irc[sysadmin]="#systems" +role_recipients_irc[domainadmin]="#domains" +role_recipients_irc[dba]="#databases #systems" +role_recipients_irc[webmaster]="#marketing #development" +role_recipients_irc[proxyadmin]="#proxy-admin" +role_recipients_irc[sitemgr]="#sites" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# irc notification options +# +SEND_IRC="YES" +DEFAULT_RECIPIENT_IRC="#system-alarms" +IRC_NETWORK="irc.freenode.net" +IRC_NICKNAME="netdata-alarm-user" +IRC_REALNAME="netdata-user" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/irc/metadata.yaml b/src/health/notifications/irc/metadata.yaml index aa2593f91..aa2593f91 100644 --- a/health/notifications/irc/metadata.yaml +++ b/src/health/notifications/irc/metadata.yaml diff --git a/src/health/notifications/kavenegar/README.md b/src/health/notifications/kavenegar/README.md new file mode 100644 index 000000000..eedd43a23 --- /dev/null +++ b/src/health/notifications/kavenegar/README.md @@ -0,0 +1,120 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/kavenegar/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/kavenegar/metadata.yaml" +sidebar_label: "Kavenegar" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Kavenegar + + +<img src="https://netdata.cloud/img/kavenegar.png" width="150"/> + + +[Kavenegar](https://kavenegar.com/) is a service for software developers, based in Iran, that provides APIs for sending and receiving SMS and for making voice calls. +You can send notifications to Kavenegar using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- The APIKEY and Sender from http://panel.kavenegar.com/client/setting/account +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_KAVENEGAR | Set `SEND_KAVENEGAR` to YES | YES | yes | +| KAVENEGAR_API_KEY | Set `KAVENEGAR_API_KEY` to your API key. | | yes | +| KAVENEGAR_SENDER | Set `KAVENEGAR_SENDER` to the value of your Sender. | | yes | +| DEFAULT_RECIPIENT_KAVENEGAR | Set `DEFAULT_RECIPIENT_KAVENEGAR` to the SMS recipient you want the alert notifications to be sent to. You can define multiple recipients like this: 09155555555 09177777777. | | yes | + +##### DEFAULT_RECIPIENT_KAVENEGAR + +All roles will default to this variable if left unconfigured. + +You can then have different SMS recipients per role, by editing `DEFAULT_RECIPIENT_KAVENEGAR` with the SMS recipients you want, in the following entries at the bottom of the same file: +```conf +role_recipients_kavenegar[sysadmin]="09100000000" +role_recipients_kavenegar[domainadmin]="09111111111" +role_recipients_kavenegar[dba]="0922222222" +role_recipients_kavenegar[webmaster]="0933333333" +role_recipients_kavenegar[proxyadmin]="0944444444" +role_recipients_kavenegar[sitemgr]="0955555555" +``` + +The values you provide should be defined as environments in `/etc/alertad.conf` with `ALLOWED_ENVIRONMENTS` option. 
+ + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# Kavenegar (Kavenegar.com) SMS options + +SEND_KAVENEGAR="YES" +KAVENEGAR_API_KEY="XXXXXXXXXXXX" +KAVENEGAR_SENDER="YYYYYYYY" +DEFAULT_RECIPIENT_KAVENEGAR="0912345678" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/kavenegar/metadata.yaml b/src/health/notifications/kavenegar/metadata.yaml index 559dbac09..559dbac09 100644 --- a/health/notifications/kavenegar/metadata.yaml +++ b/src/health/notifications/kavenegar/metadata.yaml diff --git a/src/health/notifications/matrix/README.md b/src/health/notifications/matrix/README.md new file mode 100644 index 000000000..3c01a9ef2 --- /dev/null +++ b/src/health/notifications/matrix/README.md @@ -0,0 +1,132 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/matrix/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/matrix/metadata.yaml" +sidebar_label: "Matrix" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Matrix + + +<img src="https://netdata.cloud/img/matrix.svg" width="150"/> + + +Send notifications to Matrix network rooms using Netdata's Agent alert notification 
feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- The url of the homeserver (`https://homeserver:port`). +- Credentials for connecting to the homeserver, in the form of a valid access token for your account (or for a dedicated notification account). These tokens usually don't expire. +- The Room ids that you want to send the notification to. +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_MATRIX | Set `SEND_MATRIX` to YES | YES | yes | +| MATRIX_HOMESERVER | set `MATRIX_HOMESERVER` to the URL of the Matrix homeserver. | | yes | +| MATRIX_ACCESSTOKEN | Set `MATRIX_ACCESSTOKEN` to the access token from your Matrix account. | | yes | +| DEFAULT_RECIPIENT_MATRIX | Set `DEFAULT_RECIPIENT_MATRIX` to the Rooms you want the alert notifications to be sent to. The format is `!roomid:homeservername`. 
| | yes | + +##### MATRIX_ACCESSTOKEN + +To obtain the access token, you can use the following curl command: +``` +curl -XPOST -d '{"type":"m.login.password", "user":"example", "password":"wordpass"}' "https://homeserver:8448/_matrix/client/r0/login" +``` + + +##### DEFAULT_RECIPIENT_MATRIX + +The Room ids are unique identifiers and can be obtained from the Room settings in a Matrix client (e.g. Riot). + +You can define multiple Rooms like this: `!roomid1:homeservername` `!roomid2:homeservername`. + +All roles will default to this variable if left unconfigured. + +You can have different Rooms per role, by editing `DEFAULT_RECIPIENT_MATRIX` with the `!roomid:homeservername` you want, in the following entries at the bottom of the same file: + +```conf +role_recipients_matrix[sysadmin]="!roomid1:homeservername" +role_recipients_matrix[domainadmin]="!roomid2:homeservername" +role_recipients_matrix[dba]="!roomid3:homeservername" +role_recipients_matrix[webmaster]="!roomid4:homeservername" +role_recipients_matrix[proxyadmin]="!roomid5:homeservername" +role_recipients_matrix[sitemgr]="!roomid6:homeservername" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# Matrix notifications + +SEND_MATRIX="YES" +MATRIX_HOMESERVER="https://matrix.org:8448" +MATRIX_ACCESSTOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" +DEFAULT_RECIPIENT_MATRIX="!XXXXXXXXXXXX:matrix.org" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert 
mechanisms for the selected role. + + diff --git a/src/health/notifications/matrix/metadata.yaml b/src/health/notifications/matrix/metadata.yaml new file mode 100644 index 000000000..db7f92eb1 --- /dev/null +++ b/src/health/notifications/matrix/metadata.yaml @@ -0,0 +1,91 @@ +# yamllint disable rule:line-length +--- +- id: 'notify-matrix' + meta: + name: 'Matrix' + link: 'https://spec.matrix.org/unstable/push-gateway-api/' + categories: + - notify.agent + icon_filename: 'matrix.svg' + keywords: + - Matrix + overview: + notification_description: | + Send notifications to Matrix network rooms using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + notification_limitations: '' + setup: + prerequisites: + list: + - title: '' + description: | + - The url of the homeserver (`https://homeserver:port`). + - Credentials for connecting to the homeserver, in the form of a valid access token for your account (or for a dedicated notification account). These tokens usually don't expire. + - The Room ids that you want to send the notification to. + - Access to the terminal where Netdata Agent is running + configuration: + file: + name: 'health_alarm_notify.conf' + options: + description: 'The following options can be defined for this notification' + folding: + title: 'Config Options' + enabled: true + list: + - name: 'SEND_MATRIX' + default_value: 'YES' + description: "Set `SEND_MATRIX` to YES" + required: true + - name: 'MATRIX_HOMESERVER' + default_value: '' + description: "set `MATRIX_HOMESERVER` to the URL of the Matrix homeserver." + required: true + - name: 'MATRIX_ACCESSTOKEN' + default_value: '' + description: "Set `MATRIX_ACCESSTOKEN` to the access token from your Matrix account." 
+ required: true + detailed_description: | + To obtain the access token, you can use the following curl command: + ``` + curl -XPOST -d '{"type":"m.login.password", "user":"example", "password":"wordpass"}' "https://homeserver:8448/_matrix/client/r0/login" + ``` + - name: 'DEFAULT_RECIPIENT_MATRIX' + default_value: '' + description: "Set `DEFAULT_RECIPIENT_MATRIX` to the Rooms you want the alert notifications to be sent to. The format is `!roomid:homeservername`." + required: true + detailed_description: | + The Room ids are unique identifiers and can be obtained from the Room settings in a Matrix client (e.g. Riot). + + You can define multiple Rooms like this: `!roomid1:homeservername` `!roomid2:homeservername`. + + All roles will default to this variable if left unconfigured. + + You can have different Rooms per role, by editing `DEFAULT_RECIPIENT_MATRIX` with the `!roomid:homeservername` you want, in the following entries at the bottom of the same file: + + ```conf + role_recipients_matrix[sysadmin]="!roomid1:homeservername" + role_recipients_matrix[domainadmin]="!roomid2:homeservername" + role_recipients_matrix[dba]="!roomid3:homeservername" + role_recipients_matrix[webmaster]="!roomid4:homeservername" + role_recipients_matrix[proxyadmin]="!roomid5:homeservername" + role_recipients_matrix[sitemgr]="!roomid6:homeservername" + ``` + examples: + folding: + enabled: true + title: '' + list: + - name: 'Basic Configuration' + folding: + enabled: false + description: '' + config: | + #------------------------------------------------------------------------------ + # Matrix notifications + + SEND_MATRIX="YES" + MATRIX_HOMESERVER="https://matrix.org:8448" + MATRIX_ACCESSTOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" + DEFAULT_RECIPIENT_MATRIX="!XXXXXXXXXXXX:matrix.org" + troubleshooting: + problems: + list: [] diff --git a/src/health/notifications/messagebird/README.md b/src/health/notifications/messagebird/README.md new file mode 100644 index 
000000000..4b668fce3 --- /dev/null +++ b/src/health/notifications/messagebird/README.md @@ -0,0 +1,117 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/messagebird/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/messagebird/metadata.yaml" +sidebar_label: "MessageBird" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# MessageBird + + +<img src="https://netdata.cloud/img/messagebird.svg" width="150"/> + + +Send notifications to MessageBird using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- An access key under 'API ACCESS (REST)' (you will want a live key), you can read more [here](https://developers.messagebird.com/quickstarts/sms/test-credits-api-keys/). +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_MESSAGEBIRD | Set `SEND_MESSAGEBIRD` to YES | YES | yes | +| MESSAGEBIRD_ACCESS_KEY | Set `MESSAGEBIRD_ACCESS_KEY` to your API key. 
| | yes | +| MESSAGEBIRD_NUMBER | Set `MESSAGEBIRD_NUMBER` to the MessageBird number you want to use for the alert. | | yes | +| DEFAULT_RECIPIENT_MESSAGEBIRD | Set `DEFAULT_RECIPIENT_MESSAGEBIRD` to the number you want the alert notification to be sent as an SMS. You can define multiple recipients like this: +15555555555 +17777777777. | | yes | + +##### DEFAULT_RECIPIENT_MESSAGEBIRD + +All roles will default to this variable if left unconfigured. + +You can then have different recipients per role, by editing `DEFAULT_RECIPIENT_MESSAGEBIRD` with the number you want, in the following entries at the bottom of the same file: +```conf +role_recipients_messagebird[sysadmin]="+15555555555" +role_recipients_messagebird[domainadmin]="+15555555556" +role_recipients_messagebird[dba]="+15555555557" +role_recipients_messagebird[webmaster]="+15555555558" +role_recipients_messagebird[proxyadmin]="+15555555559" +role_recipients_messagebird[sitemgr]="+15555555550" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# Messagebird (messagebird.com) SMS options + +SEND_MESSAGEBIRD="YES" +MESSAGEBIRD_ACCESS_KEY="XXXXXXXX" +MESSAGEBIRD_NUMBER="XXXXXXX" +DEFAULT_RECIPIENT_MESSAGEBIRD="+15555555555" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/messagebird/metadata.yaml b/src/health/notifications/messagebird/metadata.yaml index a97cdc712..a97cdc712 100644 --- a/health/notifications/messagebird/metadata.yaml +++ b/src/health/notifications/messagebird/metadata.yaml diff --git a/src/health/notifications/msteams/README.md b/src/health/notifications/msteams/README.md new file mode 100644 index 000000000..e24730777 --- /dev/null +++ b/src/health/notifications/msteams/README.md @@ -0,0 +1,118 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/msteams/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/msteams/metadata.yaml" +sidebar_label: "Microsoft Teams" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Microsoft Teams + + +<img src="https://netdata.cloud/img/msteams.svg" width="150"/> + + +You can send Netdata alerts to Microsoft Teams using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- The incoming webhook URL as given by Microsoft Teams. You can use the same on all your Netdata servers (or you can have multiple if you like). +- One or more channels to post the messages to +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_MSTEAMS | Set `SEND_MSTEAMS` to YES | YES | yes | +| MSTEAMS_WEBHOOK_URL | set `MSTEAMS_WEBHOOK_URL` to the incoming webhook URL as given by Microsoft Teams. | | yes | +| DEFAULT_RECIPIENT_MSTEAMS | Set `DEFAULT_RECIPIENT_MSTEAMS` to the encoded Microsoft Teams channel name you want the alert notifications to be sent to. | | yes | + +##### DEFAULT_RECIPIENT_MSTEAMS + +In Microsoft Teams the channel name is encoded in the URI after `/IncomingWebhook/`. You can define multiple channels like this: `CHANNEL1` `CHANNEL2`. + +All roles will default to this variable if left unconfigured. + +You can have different channels per role, by editing `DEFAULT_RECIPIENT_MSTEAMS` with the channel you want, in the following entries at the bottom of the same file: +```conf +role_recipients_msteams[sysadmin]="CHANNEL1" +role_recipients_msteams[domainadmin]="CHANNEL2" +role_recipients_msteams[dba]="databases CHANNEL3" +role_recipients_msteams[webmaster]="CHANNEL4" +role_recipients_msteams[proxyadmin]="CHANNEL5" +role_recipients_msteams[sitemgr]="CHANNEL6" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# Microsoft Teams (office.com) global notification options + +SEND_MSTEAMS="YES" +MSTEAMS_WEBHOOK_URL="https://outlook.office.com/webhook/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/IncomingWebhook/CHANNEL/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" +DEFAULT_RECIPIENT_MSTEAMS="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, 
to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/msteams/metadata.yaml b/src/health/notifications/msteams/metadata.yaml index 72de507a4..72de507a4 100644 --- a/health/notifications/msteams/metadata.yaml +++ b/src/health/notifications/msteams/metadata.yaml diff --git a/src/health/notifications/ntfy/README.md b/src/health/notifications/ntfy/README.md new file mode 100644 index 000000000..a03e30304 --- /dev/null +++ b/src/health/notifications/ntfy/README.md @@ -0,0 +1,135 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/ntfy/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/ntfy/metadata.yaml" +sidebar_label: "ntfy" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# ntfy + + +<img src="https://netdata.cloud/img/ntfy.svg" width="150"/> + + +[ntfy](https://ntfy.sh/) (pronounce: notify) is a simple HTTP-based [pub-sub](https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern) notification service. It allows you to send notifications to your phone or desktop via scripts from any computer, entirely without signup, cost or setup. It's also [open source](https://github.com/binwiederhier/ntfy) if you want to run your own server. +You can send alerts to an ntfy server using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. 
+ + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- (Optional) A [self-hosted ntfy server](https://docs.ntfy.sh/faq/#can-i-self-host-it), in case you don't want to use https://ntfy.sh +- A new [topic](https://ntfy.sh/#subscribe) for the notifications to be published to +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_NTFY | Set `SEND_NTFY` to YES | YES | yes | +| DEFAULT_RECIPIENT_NTFY | URL formed by the server-topic combination you want the alert notifications to be sent to. Unless hosting your own server, the server should always be set to https://ntfy.sh. | | yes | +| NTFY_USERNAME | The username for netdata to use to authenticate with an ntfy server. | | no | +| NTFY_PASSWORD | The password for netdata to use to authenticate with an ntfy server. | | no | +| NTFY_ACCESS_TOKEN | The access token for netdata to use to authenticate with an ntfy server. | | no | + +##### DEFAULT_RECIPIENT_NTFY + +You can define multiple recipient URLs like this: `https://SERVER1/TOPIC1` `https://SERVER2/TOPIC2` + +All roles will default to this variable if left unconfigured. 
+ +You can then have different servers and/or topics per role, by editing DEFAULT_RECIPIENT_NTFY with the server-topic combination you want, in the following entries at the bottom of the same file: +```conf +role_recipients_ntfy[sysadmin]="https://SERVER1/TOPIC1" +role_recipients_ntfy[domainadmin]="https://SERVER2/TOPIC2" +role_recipients_ntfy[dba]="https://SERVER3/TOPIC3" +role_recipients_ntfy[webmaster]="https://SERVER4/TOPIC4" +role_recipients_ntfy[proxyadmin]="https://SERVER5/TOPIC5" +role_recipients_ntfy[sitemgr]="https://SERVER6/TOPIC6" +``` + + +##### NTFY_USERNAME + +Only useful on self-hosted ntfy instances. See [users and roles](https://docs.ntfy.sh/config/#users-and-roles) for details. +Ensure that your user has proper read/write access to the provided topic in `DEFAULT_RECIPIENT_NTFY` + + +##### NTFY_PASSWORD + +Only useful on self-hosted ntfy instances. See [users and roles](https://docs.ntfy.sh/config/#users-and-roles) for details. +Ensure that your user has proper read/write access to the provided topic in `DEFAULT_RECIPIENT_NTFY` + + +##### NTFY_ACCESS_TOKEN + +This can be used in place of `NTFY_USERNAME` and `NTFY_PASSWORD` to authenticate with a self-hosted ntfy instance. See [access tokens](https://docs.ntfy.sh/config/?h=access+to#access-tokens) for details. 
+Ensure that the token user has proper read/write access to the provided topic in `DEFAULT_RECIPIENT_NTFY` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +SEND_NTFY="YES" +DEFAULT_RECIPIENT_NTFY="https://ntfy.sh/netdata-X7seHg7d3Tw9zGOk https://ntfy.sh/netdata-oIPm4IK1IlUtlA30" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/ntfy/metadata.yaml b/src/health/notifications/ntfy/metadata.yaml index 0d6c0beac..0d6c0beac 100644 --- a/health/notifications/ntfy/metadata.yaml +++ b/src/health/notifications/ntfy/metadata.yaml diff --git a/src/health/notifications/opsgenie/README.md b/src/health/notifications/opsgenie/README.md new file mode 100644 index 000000000..fa5859d7d --- /dev/null +++ b/src/health/notifications/opsgenie/README.md @@ -0,0 +1,98 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/opsgenie/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/opsgenie/metadata.yaml" +sidebar_label: "OpsGenie" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# OpsGenie + + +<img src="https://netdata.cloud/img/opsgenie.png" width="150"/> + + +Opsgenie is an alerting and incident response tool. 
It is designed to group and filter alarms, build custom routing rules for on-call teams, and correlate deployments and commits to incidents. +You can send notifications to Opsgenie using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- An Opsgenie integration. You can create an [integration](https://docs.opsgenie.com/docs/api-integration) in the [Opsgenie](https://www.atlassian.com/software/opsgenie) dashboard. +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_OPSGENIE | Set `SEND_OPSGENIE` to YES | YES | yes | +| OPSGENIE_API_KEY | Set `OPSGENIE_API_KEY` to your API key. | | yes | +| OPSGENIE_API_URL | Set `OPSGENIE_API_URL` to the corresponding URL if required, for example there are region-specific API URLs such as `https://eu.api.opsgenie.com`. 
| https://api.opsgenie.com | no | + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +SEND_OPSGENIE="YES" +OPSGENIE_API_KEY="11111111-2222-3333-4444-555555555555" +OPSGENIE_API_URL="" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/opsgenie/metadata.yaml b/src/health/notifications/opsgenie/metadata.yaml index 78bd8c2bd..78bd8c2bd 100644 --- a/health/notifications/opsgenie/metadata.yaml +++ b/src/health/notifications/opsgenie/metadata.yaml diff --git a/src/health/notifications/pagerduty/README.md b/src/health/notifications/pagerduty/README.md new file mode 100644 index 000000000..ae45e5385 --- /dev/null +++ b/src/health/notifications/pagerduty/README.md @@ -0,0 +1,117 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/pagerduty/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/pagerduty/metadata.yaml" +sidebar_label: "PagerDuty" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# PagerDuty + + +<img src="https://netdata.cloud/img/pagerduty.png" width="150"/> + + +PagerDuty is an enterprise incident resolution service that integrates with ITOps and DevOps monitoring stacks to improve operational reliability and agility. 
From enriching and aggregating events to correlating them into incidents, PagerDuty streamlines the incident management process by reducing alert noise and resolution times. +You can send notifications to PagerDuty using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- An installation of the [PagerDuty](https://www.pagerduty.com/docs/guides/agent-install-guide/) agent on the node running the Netdata Agent +- A PagerDuty Generic API service using either the `Events API v2` or `Events API v1` +- [Add a new service](https://support.pagerduty.com/docs/services-and-integrations#section-configuring-services-and-integrations) to PagerDuty. Click Use our API directly and select either `Events API v2` or `Events API v1`. Once you finish creating the service, click on the Integrations tab to find your Integration Key. +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_PD | Set `SEND_PD` to YES | YES | yes | +| DEFAULT_RECIPIENT_PD | Set `DEFAULT_RECIPIENT_PD` to the PagerDuty service key you want the alert notifications to be sent to. You can define multiple service keys like this: `pd_service_key_1` `pd_service_key_2`. 
| | yes | + +##### DEFAULT_RECIPIENT_PD + +All roles will default to this variable if left unconfigured. + +The `DEFAULT_RECIPIENT_PD` can be edited in the following entries at the bottom of the same file: +```conf +role_recipients_pd[sysadmin]="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxa" +role_recipients_pd[domainadmin]="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxb" +role_recipients_pd[dba]="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxc" +role_recipients_pd[webmaster]="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxd" +role_recipients_pd[proxyadmin]="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxe" +role_recipients_pd[sitemgr]="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxf" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# pagerduty.com notification options + +SEND_PD="YES" +DEFAULT_RECIPIENT_PD="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +USE_PD_VERSION="2" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/pagerduty/metadata.yaml b/src/health/notifications/pagerduty/metadata.yaml index 6fc1d640e..6fc1d640e 100644 --- a/health/notifications/pagerduty/metadata.yaml +++ b/src/health/notifications/pagerduty/metadata.yaml diff --git a/src/health/notifications/prowl/README.md b/src/health/notifications/prowl/README.md new file mode 100644 index 000000000..0d206cee0 --- /dev/null +++ b/src/health/notifications/prowl/README.md @@ -0,0 +1,119 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/prowl/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/prowl/metadata.yaml" +sidebar_label: "Prowl" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Prowl + + +<img src="https://netdata.cloud/img/prowl.png" width="150"/> + + +Send notifications to Prowl using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Limitations + +- Because of how Netdata integrates with Prowl, there is a hard limit of at most 1000 notifications per hour (starting from the first notification sent). Any alerts beyond the first thousand in an hour will be dropped. +- Warning messages will be sent with the 'High' priority, critical messages will be sent with the 'Emergency' priority, and all other messages will be sent with the normal priority. Opening the notification's associated URL will take you to the Netdata dashboard of the system that issued the alert, directly to the chart that it triggered on. 
+ + + +## Setup + +### Prerequisites + +#### + +- A Prowl API key, which can be requested through the Prowl website after registering +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_PROWL | Set `SEND_PROWL` to YES | YES | yes | +| DEFAULT_RECIPIENT_PROWL | Set `DEFAULT_RECIPIENT_PROWL` to the Prowl API key you want the alert notifications to be sent to. You can define multiple API keys like this: `APIKEY1`, `APIKEY2`. | | yes | + +##### DEFAULT_RECIPIENT_PROWL + +All roles will default to this variable if left unconfigured. 
+ +The `DEFAULT_RECIPIENT_PROWL` can be edited in the following entries at the bottom of the same file: +```conf +role_recipients_prowl[sysadmin]="AAAAAAAA" +role_recipients_prowl[domainadmin]="BBBBBBBBB" +role_recipients_prowl[dba]="CCCCCCCCC" +role_recipients_prowl[webmaster]="DDDDDDDDDD" +role_recipients_prowl[proxyadmin]="EEEEEEEEEE" +role_recipients_prowl[sitemgr]="FFFFFFFFFF" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# iOS Push Notifications + +SEND_PROWL="YES" +DEFAULT_RECIPIENT_PROWL="XXXXXXXXXX" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/prowl/metadata.yaml b/src/health/notifications/prowl/metadata.yaml index b3f0e0a1e..b3f0e0a1e 100644 --- a/health/notifications/prowl/metadata.yaml +++ b/src/health/notifications/prowl/metadata.yaml diff --git a/src/health/notifications/pushbullet/README.md b/src/health/notifications/pushbullet/README.md new file mode 100644 index 000000000..1b30f4c97 --- /dev/null +++ b/src/health/notifications/pushbullet/README.md @@ -0,0 +1,117 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/pushbullet/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/pushbullet/metadata.yaml" +sidebar_label: "Pushbullet" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Pushbullet + + +<img src="https://netdata.cloud/img/pushbullet.png" width="150"/> + + +Send notifications to Pushbullet using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- A Pushbullet access token that can be created in your [account settings](https://www.pushbullet.com/#settings/account). +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_PUSHBULLET | Set `SEND_PUSHBULLET` to YES | YES | yes | +| PUSHBULLET_ACCESS_TOKEN | Set `PUSHBULLET_ACCESS_TOKEN` to the access token you generated. | | yes | +| DEFAULT_RECIPIENT_PUSHBULLET | Set `DEFAULT_RECIPIENT_PUSHBULLET` to the email (e.g. `example@domain.com`) or the channel tag (e.g. `#channel`) you want the alert notifications to be sent to. | | yes | + +##### DEFAULT_RECIPIENT_PUSHBULLET + +You can define multiple entries like this: user1@email.com user2@email.com. + +All roles will default to this variable if left unconfigured. + +The `DEFAULT_RECIPIENT_PUSHBULLET` can be edited in the following entries at the bottom of the same file: +```conf +role_recipients_pushbullet[sysadmin]="user1@email.com" +role_recipients_pushbullet[domainadmin]="user2@mail.com" +role_recipients_pushbullet[dba]="#channel1" +role_recipients_pushbullet[webmaster]="#channel2" +role_recipients_pushbullet[proxyadmin]="user3@mail.com" +role_recipients_pushbullet[sitemgr]="user4@mail.com" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# pushbullet (pushbullet.com) push notification options + +SEND_PUSHBULLET="YES" +PUSHBULLET_ACCESS_TOKEN="XXXXXXXXX" +DEFAULT_RECIPIENT_PUSHBULLET="admin1@example.com admin3@somemail.com #examplechanneltag #anotherchanneltag" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + 
+# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/pushbullet/metadata.yaml b/src/health/notifications/pushbullet/metadata.yaml index 430033cca..430033cca 100644 --- a/health/notifications/pushbullet/metadata.yaml +++ b/src/health/notifications/pushbullet/metadata.yaml diff --git a/src/health/notifications/pushover/README.md b/src/health/notifications/pushover/README.md new file mode 100644 index 000000000..9d30dfa97 --- /dev/null +++ b/src/health/notifications/pushover/README.md @@ -0,0 +1,119 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/pushover/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/pushover/metadata.yaml" +sidebar_label: "PushOver" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# PushOver + + +<img src="https://netdata.cloud/img/pushover.png" width="150"/> + + +Send notifications to Pushover using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. +- Netdata will send warning messages with priority 0 and critical messages with priority 1. +- Pushover allows you to select do-not-disturb hours. The way this is configured, critical notifications will ring and vibrate your phone, even during the do-not-disturb hours. +- All other notifications will be delivered silently. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- An Application token. You can use the same on all your Netdata servers. 
+- A User token for each user you are going to send notifications to. This is the actual recipient of the notification. +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_PUSHOVER | Set `SEND_PUSHOVER` to YES | YES | yes | +| PUSHOVER_APP_TOKEN | Set `PUSHOVER_APP_TOKEN` to your Pushover Application token. | | yes | +| DEFAULT_RECIPIENT_PUSHOVER | Set `DEFAULT_RECIPIENT_PUSHOVER` to the Pushover User token you want the alert notifications to be sent to. You can define multiple User tokens like this: `USERTOKEN1` `USERTOKEN2`. | | yes | + +##### DEFAULT_RECIPIENT_PUSHOVER + +All roles will default to this variable if left unconfigured. 
+ +The `DEFAULT_RECIPIENT_PUSHOVER` can be edited in the following entries at the bottom of the same file: +```conf +role_recipients_pushover[sysadmin]="USERTOKEN1" +role_recipients_pushover[domainadmin]="USERTOKEN2" +role_recipients_pushover[dba]="USERTOKEN3 USERTOKEN4" +role_recipients_pushover[webmaster]="USERTOKEN5" +role_recipients_pushover[proxyadmin]="USERTOKEN6" +role_recipients_pushover[sitemgr]="USERTOKEN7" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# pushover (pushover.net) global notification options + +SEND_PUSHOVER="YES" +PUSHOVER_APP_TOKEN="XXXXXXXXX" +DEFAULT_RECIPIENT_PUSHOVER="USERTOKEN" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/pushover/metadata.yaml b/src/health/notifications/pushover/metadata.yaml index 9af729ea8..9af729ea8 100644 --- a/health/notifications/pushover/metadata.yaml +++ b/src/health/notifications/pushover/metadata.yaml diff --git a/src/health/notifications/rocketchat/README.md b/src/health/notifications/rocketchat/README.md new file mode 100644 index 000000000..b9b0d5687 --- /dev/null +++ b/src/health/notifications/rocketchat/README.md @@ -0,0 +1,116 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/rocketchat/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/rocketchat/metadata.yaml" +sidebar_label: "RocketChat" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# RocketChat + + +<img src="https://netdata.cloud/img/rocketchat.png" width="150"/> + + +Send notifications to Rocket.Chat using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- The incoming webhook URL as given by RocketChat. You can use the same on all your Netdata servers (or you can have multiple if you like - your decision). +- One or more channels to post the messages to +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_ROCKETCHAT | Set `SEND_ROCKETCHAT` to `YES` | YES | yes | +| ROCKETCHAT_WEBHOOK_URL | set `ROCKETCHAT_WEBHOOK_URL` to your webhook URL. | | yes | +| DEFAULT_RECIPIENT_ROCKETCHAT | Set `DEFAULT_RECIPIENT_ROCKETCHAT` to the channel you want the alert notifications to be sent to. You can define multiple channels like this: `alerts` `systems`. | | yes | + +##### DEFAULT_RECIPIENT_ROCKETCHAT + +All roles will default to this variable if left unconfigured. + +The `DEFAULT_RECIPIENT_ROCKETCHAT` can be edited in the following entries at the bottom of the same file: +```conf +role_recipients_rocketchat[sysadmin]="systems" +role_recipients_rocketchat[domainadmin]="domains" +role_recipients_rocketchat[dba]="databases systems" +role_recipients_rocketchat[webmaster]="marketing development" +role_recipients_rocketchat[proxyadmin]="proxy_admin" +role_recipients_rocketchat[sitemgr]="sites" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# rocketchat (rocket.chat) global notification options + +SEND_ROCKETCHAT="YES" +ROCKETCHAT_WEBHOOK_URL="<your_incoming_webhook_url>" +DEFAULT_RECIPIENT_ROCKETCHAT="monitoring_alarms" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role 
+/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/health/notifications/rocketchat/metadata.yaml b/src/health/notifications/rocketchat/metadata.yaml index f644b93e1..f644b93e1 100644 --- a/health/notifications/rocketchat/metadata.yaml +++ b/src/health/notifications/rocketchat/metadata.yaml diff --git a/health/notifications/sample-metadata.yaml b/src/health/notifications/sample-metadata.yaml index 41a287aeb..41a287aeb 100644 --- a/health/notifications/sample-metadata.yaml +++ b/src/health/notifications/sample-metadata.yaml diff --git a/src/health/notifications/slack/README.md b/src/health/notifications/slack/README.md new file mode 100644 index 000000000..35cb75a18 --- /dev/null +++ b/src/health/notifications/slack/README.md @@ -0,0 +1,101 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/slack/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/slack/metadata.yaml" +sidebar_label: "Slack" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Slack + + +<img src="https://netdata.cloud/img/slack.png" width="150"/> + + +Send notifications to a Slack workspace using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- Slack app along with an incoming webhook, read Slack's guide on the topic [here](https://api.slack.com/messaging/webhooks). 
+- One or more channels to post the messages to +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_SLACK | Set `SEND_SLACK` to YES | YES | yes | +| SLACK_WEBHOOK_URL | set `SLACK_WEBHOOK_URL` to your Slack app's webhook URL. | | yes | +| DEFAULT_RECIPIENT_SLACK | Set `DEFAULT_RECIPIENT_SLACK` to the Slack channel your Slack app is set to send messages to. The syntax for channels is `#channel` or `channel`. | | yes | + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# slack (slack.com) global notification options + +SEND_SLACK="YES" +SLACK_WEBHOOK_URL="https://hooks.slack.com/services/XXXXXXXX/XXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" +DEFAULT_RECIPIENT_SLACK="#alarms" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/slack/metadata.yaml b/src/health/notifications/slack/metadata.yaml index 226c7ca37..226c7ca37 100644 --- a/health/notifications/slack/metadata.yaml +++ b/src/health/notifications/slack/metadata.yaml diff --git a/src/health/notifications/smstools3/README.md b/src/health/notifications/smstools3/README.md new file mode 100644 index 000000000..dafc0b7f4 --- /dev/null +++ b/src/health/notifications/smstools3/README.md @@ -0,0 +1,126 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/smstools3/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/smstools3/metadata.yaml" +sidebar_label: "SMS" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# SMS + + +<img src="https://netdata.cloud/img/sms.svg" width="150"/> + + +Send notifications to `smstools3` using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. +The SMS Server Tools 3 is a SMS Gateway software which can send and receive short messages through GSM modems and mobile phones. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- [Install](http://smstools3.kekekasvi.com/index.php?p=compiling) and [configure](http://smstools3.kekekasvi.com/index.php?p=configure) `smsd` +- To ensure that the user `netdata` can execute `sendsms`. Any user executing `sendsms` needs to: + - Have write permissions to /tmp and /var/spool/sms/outgoing + - Be a member of group smsd + - To ensure that the steps above are successful, just su netdata and execute sendsms phone message. 
+- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| sendsms | Set the path for `sendsms`, otherwise Netdata will search for it in your system `$PATH`. | | yes | +| SEND_SMS | Set `SEND_SMS` to `YES`. | YES | yes | +| DEFAULT_RECIPIENT_SMS | Set DEFAULT_RECIPIENT_SMS to the phone number you want the alert notifications to be sent to. You can define multiple phone numbers like this: PHONE1 PHONE2. | | yes | + +##### sendsms + +# The full path of the sendsms command (smstools3). +# If empty, the system $PATH will be searched for it. +# If not found, SMS notifications will be silently disabled. +sendsms="/usr/bin/sendsms" + + +##### DEFAULT_RECIPIENT_SMS + +All roles will default to this variable if left unconfigured. 
+ +You can then have different phone numbers per role, by editing `DEFAULT_RECIPIENT_SMS` with the phone number you want, in the following entries at the bottom of the same file: +```conf +role_recipients_sms[sysadmin]="PHONE1" +role_recipients_sms[domainadmin]="PHONE2" +role_recipients_sms[dba]="PHONE3" +role_recipients_sms[webmaster]="PHONE4" +role_recipients_sms[proxyadmin]="PHONE5" +role_recipients_sms[sitemgr]="PHONE6" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# SMS Server Tools 3 (smstools3) global notification options +SEND_SMS="YES" +DEFAULT_RECIPIENT_SMS="1234567890" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/smstools3/metadata.yaml b/src/health/notifications/smstools3/metadata.yaml index 3a29183a5..3a29183a5 100644 --- a/health/notifications/smstools3/metadata.yaml +++ b/src/health/notifications/smstools3/metadata.yaml diff --git a/src/health/notifications/syslog/README.md b/src/health/notifications/syslog/README.md new file mode 100644 index 000000000..72534b1c8 --- /dev/null +++ b/src/health/notifications/syslog/README.md @@ -0,0 +1,132 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/syslog/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/syslog/metadata.yaml" +sidebar_label: "syslog" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# syslog + + +<img src="https://netdata.cloud/img/syslog.png" width="150"/> + + +Send notifications to Syslog using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- A working `logger` command for this to work. This is the case on pretty much every Linux system in existence, and most BSD systems. +- Access to the terminal where Netdata Agent is running + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). 
+ +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SYSLOG_FACILITY | Set `SYSLOG_FACILITY` to the facility used for logging, by default this value is set to `local6`. | | yes | +| DEFAULT_RECIPIENT_SYSLOG | Set `DEFAULT_RECIPIENT_SYSLOG` to the recipient you want the alert notifications to be sent to. | | yes | +| SEND_SYSLOG | Set SEND_SYSLOG to YES, make sure you have everything else configured before turning this on. | | yes | + +##### DEFAULT_RECIPIENT_SYSLOG + +Targets are defined as follows: + +``` +[[facility.level][@host[:port]]/]prefix +``` + +prefix defines what the log messages are prefixed with. By default, all lines are prefixed with 'netdata'. + +The facility and level are the standard syslog facility and level options, for more info on them see your local logger and syslog documentation. By default, Netdata will log to the local6 facility, with a log level dependent on the type of message (crit for CRITICAL, warning for WARNING, and info for everything else). + +You can configure sending directly to remote log servers by specifying a host (and optionally a port). However, this has a somewhat high overhead, so it is much preferred to use your local syslog daemon to handle the forwarding of messages to remote systems (pretty much all of them allow at least simple forwarding, and most of the really popular ones support complex queueing and routing of messages to remote log servers). + +You can define multiple recipients like this: daemon.notice@loghost:514/netdata daemon.notice@loghost2:514/netdata. +All roles will default to this variable if left unconfigured. 
+ + +##### SEND_SYSLOG + +You can then have different recipients per role, by editing DEFAULT_RECIPIENT_SYSLOG with the recipient you want, in the following entries at the bottom of the same file: + +```conf +role_recipients_syslog[sysadmin]="daemon.notice@loghost1:514/netdata" +role_recipients_syslog[domainadmin]="daemon.notice@loghost2:514/netdata" +role_recipients_syslog[dba]="daemon.notice@loghost3:514/netdata" +role_recipients_syslog[webmaster]="daemon.notice@loghost4:514/netdata" +role_recipients_syslog[proxyadmin]="daemon.notice@loghost5:514/netdata" +role_recipients_syslog[sitemgr]="daemon.notice@loghost6:514/netdata" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# syslog notifications + +SEND_SYSLOG="YES" +SYSLOG_FACILITY='local6' +DEFAULT_RECIPIENT_SYSLOG="daemon.notice@loghost6:514/netdata" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/syslog/metadata.yaml b/src/health/notifications/syslog/metadata.yaml index c5f241e76..c5f241e76 100644 --- a/health/notifications/syslog/metadata.yaml +++ b/src/health/notifications/syslog/metadata.yaml diff --git a/src/health/notifications/telegram/README.md b/src/health/notifications/telegram/README.md new file mode 100644 index 000000000..e263d0bb5 --- /dev/null +++ b/src/health/notifications/telegram/README.md @@ -0,0 +1,117 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/telegram/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/telegram/metadata.yaml" +sidebar_label: "Telegram" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Telegram + + +<img src="https://netdata.cloud/img/telegram.svg" width="150"/> + + +Send notifications to Telegram using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- A bot token. To get one, contact the [@BotFather](https://t.me/BotFather) bot and send the command `/newbot` and follow the instructions. Invite your bot to a group where you want it to send messages. +- The chat ID for every chat you want to send messages to. Invite [@myidbot](https://t.me/myidbot) bot to the group that will receive notifications, and write the command `/getgroupid@myidbot` to get the group chat ID. Group IDs start with a hyphen, supergroup IDs start with `-100`. +- Terminal access to the Agent you wish to configure. + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. 
+ + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_TELEGRAM | Set `SEND_TELEGRAM` to YES | YES | yes | +| TELEGRAM_BOT_TOKEN | set `TELEGRAM_BOT_TOKEN` to your bot token. | | yes | +| DEFAULT_RECIPIENT_TELEGRAM | Set `DEFAULT_RECIPIENT_TELEGRAM` to the chat ID you want the alert notifications to be sent to. You can define multiple chat IDs like this: -49999333322 -1009999222255. | | yes | + +##### DEFAULT_RECIPIENT_TELEGRAM + +All roles will default to this variable if left unconfigured. + +You can then have different chat IDs per role, by editing `DEFAULT_RECIPIENT_TELEGRAM` with the chat ID you want, in the following entries at the bottom of the same file: + +```conf +role_recipients_telegram[sysadmin]="-49999333324" +role_recipients_telegram[domainadmin]="-49999333389" +role_recipients_telegram[dba]="-10099992222" +role_recipients_telegram[webmaster]="-10099992222 -49999333389" +role_recipients_telegram[proxyadmin]="-49999333344" +role_recipients_telegram[sitemgr]="-49999333876" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# telegram (telegram.org) global notification options + +SEND_TELEGRAM="YES" +TELEGRAM_BOT_TOKEN="111122223:7OpFlFFRzRBbrUUmIjj5HF9Ox2pYJZy5" +DEFAULT_RECIPIENT_TELEGRAM="-49999333876" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console 
+export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. + + diff --git a/src/health/notifications/telegram/metadata.yaml b/src/health/notifications/telegram/metadata.yaml new file mode 100644 index 000000000..cc6d8c91e --- /dev/null +++ b/src/health/notifications/telegram/metadata.yaml @@ -0,0 +1,76 @@ +# yamllint disable rule:line-length +--- +- id: 'notify-telegram' + meta: + name: 'Telegram' + link: 'https://telegram.org/' + categories: + - notify.agent + icon_filename: 'telegram.svg' + keywords: + - Telegram + overview: + notification_description: | + Send notifications to Telegram using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + notification_limitations: '' + setup: + prerequisites: + list: + - title: '' + description: | + - A bot token. To get one, contact the [@BotFather](https://t.me/BotFather) bot and send the command `/newbot` and follow the instructions. Invite your bot to a group where you want it to send messages. + - The chat ID for every chat you want to send messages to. Invite [@myidbot](https://t.me/myidbot) bot to the group that will receive notifications, and write the command `/getgroupid@myidbot` to get the group chat ID. Group IDs start with a hyphen, supergroup IDs start with `-100`. + - Terminal access to the Agent you wish to configure. 
+ configuration: + file: + name: 'health_alarm_notify.conf' + options: + description: 'The following options can be defined for this notification' + folding: + title: 'Config Options' + enabled: true + list: + - name: 'SEND_TELEGRAM' + default_value: 'YES' + description: "Set `SEND_TELEGRAM` to YES" + required: true + - name: 'TELEGRAM_BOT_TOKEN' + default_value: '' + description: "set `TELEGRAM_BOT_TOKEN` to your bot token." + required: true + - name: 'DEFAULT_RECIPIENT_TELEGRAM' + default_value: '' + description: "Set `DEFAULT_RECIPIENT_TELEGRAM` to the chat ID you want the alert notifications to be sent to. You can define multiple chat IDs like this: -49999333322 -1009999222255." + required: true + detailed_description: | + All roles will default to this variable if left unconfigured. + + You can then have different chat IDs per role, by editing `DEFAULT_RECIPIENT_TELEGRAM` with the chat ID you want, in the following entries at the bottom of the same file: + + ```conf + role_recipients_telegram[sysadmin]="-49999333324" + role_recipients_telegram[domainadmin]="-49999333389" + role_recipients_telegram[dba]="-10099992222" + role_recipients_telegram[webmaster]="-10099992222 -49999333389" + role_recipients_telegram[proxyadmin]="-49999333344" + role_recipients_telegram[sitemgr]="-49999333876" + ``` + examples: + folding: + enabled: true + title: '' + list: + - name: 'Basic Configuration' + folding: + enabled: false + description: '' + config: | + #------------------------------------------------------------------------------ + # telegram (telegram.org) global notification options + + SEND_TELEGRAM="YES" + TELEGRAM_BOT_TOKEN="111122223:7OpFlFFRzRBbrUUmIjj5HF9Ox2pYJZy5" + DEFAULT_RECIPIENT_TELEGRAM="-49999333876" + troubleshooting: + problems: + list: [] diff --git a/src/health/notifications/twilio/README.md b/src/health/notifications/twilio/README.md new file mode 100644 index 000000000..cd9b17e7f --- /dev/null +++ b/src/health/notifications/twilio/README.md @@ -0,0 +1,118 @@ +<!--startmeta +custom_edit_url: 
"https://github.com/netdata/netdata/edit/master/src/health/notifications/twilio/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/health/notifications/twilio/metadata.yaml" +sidebar_label: "Twilio" +learn_status: "Published" +learn_rel_path: "Alerts & Notifications/Notifications/Agent Dispatched Notifications" +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE NOTIFICATION'S metadata.yaml FILE" +endmeta--> + +# Twilio + + +<img src="https://netdata.cloud/img/twilio.png" width="150"/> + + +Send notifications to Twilio using Netdata's Agent alert notification feature, which supports dozens of endpoints, user roles, and more. + + + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Setup + +### Prerequisites + +#### + +- Get your SID, and Token from https://www.twilio.com/console +- Terminal access to the Agent you wish to configure + + + +### Configuration + +#### File + +The configuration file name for this integration is `health_alarm_notify.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config health_alarm_notify.conf +``` +#### Options + +The following options can be defined for this notification + +<details open><summary>Config Options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| SEND_TWILIO | Set `SEND_TWILIO` to YES | YES | yes | +| TWILIO_ACCOUNT_SID | set `TWILIO_ACCOUNT_SID` to your account SID. | | yes | +| TWILIO_ACCOUNT_TOKEN | Set `TWILIO_ACCOUNT_TOKEN` to your account token. | | yes | +| TWILIO_NUMBER | Set `TWILIO_NUMBER` to your account's number. | | yes | +| DEFAULT_RECIPIENT_TWILIO | Set DEFAULT_RECIPIENT_TWILIO to the number you want the alert notifications to be sent to. 
You can define multiple numbers like this: +15555555555 +17777777777. | | yes | + +##### DEFAULT_RECIPIENT_TWILIO + +You can then have different recipients per role, by editing DEFAULT_RECIPIENT_TWILIO with the recipient's number you want, in the following entries at the bottom of the same file: + +```conf +role_recipients_twilio[sysadmin]="+15555555555" +role_recipients_twilio[domainadmin]="+15555555556" +role_recipients_twilio[dba]="+15555555557" +role_recipients_twilio[webmaster]="+15555555558" +role_recipients_twilio[proxyadmin]="+15555555559" +role_recipients_twilio[sitemgr]="+15555555550" +``` + + +</details> + +#### Examples + +##### Basic Configuration + + + +```yaml +#------------------------------------------------------------------------------ +# Twilio (twilio.com) SMS options + +SEND_TWILIO="YES" +TWILIO_ACCOUNT_SID="xxxxxxxxx" +TWILIO_ACCOUNT_TOKEN="xxxxxxxxxx" +TWILIO_NUMBER="xxxxxxxxxxx" +DEFAULT_RECIPIENT_TWILIO="+15555555555" + +``` + + +## Troubleshooting + +### Test Notification + +You can run the following command by hand, to test alerts configuration: + +```bash +# become user netdata +sudo su -s /bin/bash netdata + +# enable debugging info on the console +export NETDATA_ALARM_NOTIFY_DEBUG=1 + +# send test alarms to sysadmin +/usr/libexec/netdata/plugins.d/alarm-notify.sh test + +# send test alarms to any role +/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" +``` + +Note that this will test _all_ alert mechanisms for the selected role. 
+ + diff --git a/health/notifications/twilio/metadata.yaml b/src/health/notifications/twilio/metadata.yaml index 35fc3f042..35fc3f042 100644 --- a/health/notifications/twilio/metadata.yaml +++ b/src/health/notifications/twilio/metadata.yaml diff --git a/src/health/notifications/web/README.md b/src/health/notifications/web/README.md new file mode 100644 index 000000000..d7115be3d --- /dev/null +++ b/src/health/notifications/web/README.md @@ -0,0 +1,18 @@ +<!-- +title: "Browser pop up agent alert notifications" +sidebar_label: "Browser pop ups" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/health/notifications/web/README.md" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Integrations/Notify/Agent alert notifications" +learn_autogeneration_metadata: "{'part_of_cloud': False, 'part_of_agent': True}" +--> + +# Browser pop up agent alert notifications + +The Netdata dashboard shows HTML notifications, when it is open. + +Such web notifications look like this: +![image](https://cloud.githubusercontent.com/assets/2662304/18407279/82bac6a6-7714-11e6-847e-c2e84eeacbfb.png) + + diff --git a/src/health/rrdcalc.c b/src/health/rrdcalc.c new file mode 100644 index 000000000..bce709bf4 --- /dev/null +++ b/src/health/rrdcalc.c @@ -0,0 +1,512 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "database/rrd.h" +#include "health_internals.h" + +// ---------------------------------------------------------------------------- +// RRDCALC helpers + +void rrdcalc_flags_to_json_array(BUFFER *wb, const char *key, RRDCALC_FLAGS flags) { + buffer_json_member_add_array(wb, key); + + if(flags & RRDCALC_FLAG_DB_ERROR) + buffer_json_add_array_item_string(wb, "DB_ERROR"); + if(flags & RRDCALC_FLAG_DB_NAN) + buffer_json_add_array_item_string(wb, "DB_NAN"); + if(flags & RRDCALC_FLAG_CALC_ERROR) + buffer_json_add_array_item_string(wb, "CALC_ERROR"); + if(flags & RRDCALC_FLAG_WARN_ERROR) + buffer_json_add_array_item_string(wb, "WARN_ERROR"); + 
if(flags & RRDCALC_FLAG_CRIT_ERROR) + buffer_json_add_array_item_string(wb, "CRIT_ERROR"); + if(flags & RRDCALC_FLAG_RUNNABLE) + buffer_json_add_array_item_string(wb, "RUNNABLE"); + if(flags & RRDCALC_FLAG_DISABLED) + buffer_json_add_array_item_string(wb, "DISABLED"); + if(flags & RRDCALC_FLAG_SILENCED) + buffer_json_add_array_item_string(wb, "SILENCED"); + if(flags & RRDCALC_FLAG_RUN_ONCE) + buffer_json_add_array_item_string(wb, "RUN_ONCE"); + + buffer_json_array_close(wb); +} + +inline const char *rrdcalc_status2string(RRDCALC_STATUS status) { + switch(status) { + case RRDCALC_STATUS_REMOVED: + return "REMOVED"; + + case RRDCALC_STATUS_UNDEFINED: + return "UNDEFINED"; + + case RRDCALC_STATUS_UNINITIALIZED: + return "UNINITIALIZED"; + + case RRDCALC_STATUS_CLEAR: + return "CLEAR"; + + case RRDCALC_STATUS_RAISED: + return "RAISED"; + + case RRDCALC_STATUS_WARNING: + return "WARNING"; + + case RRDCALC_STATUS_CRITICAL: + return "CRITICAL"; + + default: + netdata_log_error("Unknown alarm status %d", status); + return "UNKNOWN"; + } +} + +uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, nd_uuid_t *config_hash_id) { + rw_spinlock_read_lock(&host->health_log.spinlock); + + // re-use old IDs, by looking them up in the alarm log + ALARM_ENTRY *ae = NULL; + for(ae = host->health_log.alarms; ae ;ae = ae->next) { + if(unlikely(name == ae->name && chart == ae->chart && uuid_eq(ae->config_hash_id, *config_hash_id))) { + if(next_event_id) *next_event_id = ae->alarm_event_id + 1; + break; + } + } + + uint32_t alarm_id; + + if(ae) + alarm_id = ae->alarm_id; + else { + alarm_id = sql_get_alarm_id(host, chart, name, next_event_id); + if (!alarm_id) { + if (unlikely(!host->health_log.next_alarm_id)) + host->health_log.next_alarm_id = (uint32_t)now_realtime_sec(); + alarm_id = host->health_log.next_alarm_id++; + } + } + + rw_spinlock_read_unlock(&host->health_log.spinlock); + return alarm_id; +} + +// 
---------------------------------------------------------------------------- +// RRDCALC replacing info/summary text variables with RRDSET labels + +static STRING *rrdcalc_replace_variables_with_rrdset_labels(const char *line, RRDCALC *rc) { + if (!line || !*line) + return NULL; + + size_t pos = 0; + char *temp = strdupz(line); + char var[RRDCALC_VAR_MAX]; + char *m, *lbl_value = NULL; + + while ((m = strchr(temp + pos, '$')) && *(m+1) == '{') { + int i = 0; + char *e = m; + while (*e) { + var[i++] = *e; + + if (*e == '}' || i == RRDCALC_VAR_MAX - 1) + break; + + e++; + } + + var[i] = '\0'; + pos = m - temp + 1; + + if (!strcmp(var, RRDCALC_VAR_FAMILY)) { + char *buf = find_and_replace(temp, var, (rc->rrdset && rc->rrdset->family) ? rrdset_family(rc->rrdset) : "", m); + freez(temp); + temp = buf; + } + else if (!strncmp(var, RRDCALC_VAR_LABEL, RRDCALC_VAR_LABEL_LEN)) { + char label_val[RRDCALC_VAR_MAX + RRDCALC_VAR_LABEL_LEN + 1] = { 0 }; + strcpy(label_val, var+RRDCALC_VAR_LABEL_LEN); + label_val[i - RRDCALC_VAR_LABEL_LEN - 1] = '\0'; + + if(likely(rc->rrdset && rc->rrdset->rrdlabels)) { + lbl_value = NULL; + rrdlabels_get_value_strdup_or_null(rc->rrdset->rrdlabels, &lbl_value, label_val); + if (lbl_value) { + char *buf = find_and_replace(temp, var, lbl_value, m); + freez(temp); + temp = buf; + freez(lbl_value); + } + } + } + } + + STRING *ret = string_strdupz(temp); + freez(temp); + + return ret; +} + +void rrdcalc_update_info_using_rrdset_labels(RRDCALC *rc) { + if(rc->rrdset && rc->rrdset->rrdlabels) { + size_t labels_version = rrdlabels_version(rc->rrdset->rrdlabels); + if (rc->labels_version != labels_version) { + STRING *old; + + old = rc->info; + rc->info = rrdcalc_replace_variables_with_rrdset_labels(string2str(rc->config.info), rc); + string_freez(old); + + old = rc->summary; + rc->summary = rrdcalc_replace_variables_with_rrdset_labels(string2str(rc->config.summary), rc); + string_freez(old); + + rc->labels_version = labels_version; + } + } + + 
if(!rc->summary) + rc->summary = string_dup(rc->config.summary); + + if(!rc->info) + rc->info = string_dup(rc->config.info); +} + +// ---------------------------------------------------------------------------- +// RRDCALC index management for RRDSET + +// the dictionary requires a unique key for every item +// we use {chart id}.{alert name} for both the RRDHOST and RRDSET alert indexes. + +#define RRDCALC_MAX_KEY_SIZE 1024 +static size_t rrdcalc_key(char *dst, size_t dst_len, const char *chart, const char *alert) { + return snprintfz(dst, dst_len, "%s,on[%s]", alert, chart); +} + +const RRDCALC_ACQUIRED *rrdcalc_from_rrdset_get(RRDSET *st, const char *alert_name) { + char key[RRDCALC_MAX_KEY_SIZE + 1]; + size_t key_len = rrdcalc_key(key, RRDCALC_MAX_KEY_SIZE, rrdset_id(st), alert_name); + + const RRDCALC_ACQUIRED *rca = (const RRDCALC_ACQUIRED *)dictionary_get_and_acquire_item_advanced(st->rrdhost->rrdcalc_root_index, key, (ssize_t)key_len); + + if(!rca) { + key_len = rrdcalc_key(key, RRDCALC_MAX_KEY_SIZE, rrdset_name(st), alert_name); + rca = (const RRDCALC_ACQUIRED *)dictionary_get_and_acquire_item_advanced(st->rrdhost->rrdcalc_root_index, key, (ssize_t)key_len); + } + + return rca; +} + +void rrdcalc_from_rrdset_release(RRDSET *st, const RRDCALC_ACQUIRED *rca) { + if(!rca) return; + + dictionary_acquired_item_release(st->rrdhost->rrdcalc_root_index, (const DICTIONARY_ITEM *)rca); +} + +RRDCALC *rrdcalc_acquired_to_rrdcalc(const RRDCALC_ACQUIRED *rca) { + if(rca) + return dictionary_acquired_item_value((const DICTIONARY_ITEM *)rca); + + return NULL; +} + +// ---------------------------------------------------------------------------- +// RRDCALC managing the linking with RRDSET + +static void rrdcalc_link_to_rrdset(RRDCALC *rc) { + RRDSET *st = rc->rrdset; + RRDHOST *host = st->rrdhost; + + rw_spinlock_write_lock(&st->alerts.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(st->alerts.base, rc, prev, next); + rw_spinlock_write_unlock(&st->alerts.spinlock); + + 
char buf[RRDVAR_MAX_LENGTH + 1]; + snprintfz(buf, RRDVAR_MAX_LENGTH, "%s.%s", rrdset_name(st), rrdcalc_name(rc)); + STRING *rrdset_name_rrdcalc_name = string_strdupz(buf); + snprintfz(buf, RRDVAR_MAX_LENGTH, "%s.%s", rrdset_id(st), rrdcalc_name(rc)); + STRING *rrdset_id_rrdcalc_name = string_strdupz(buf); + + string_freez(rrdset_id_rrdcalc_name); + string_freez(rrdset_name_rrdcalc_name); + + time_t now = now_realtime_sec(); + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc, + now, + now - rc->last_status_change, + rc->old_value, + rc->value, + RRDCALC_STATUS_REMOVED, + rc->status, + 0, + rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0); + + health_log_alert(host, ae); + health_alarm_log_add_entry(host, ae); + rrdset_flag_set(st, RRDSET_FLAG_HAS_RRDCALC_LINKED); + +} + +static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) { + RRDSET *st = rc->rrdset; + + if(!st) { + netdata_log_error( + "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", + rrdcalc_chart_name(rc), rrdcalc_name(rc)); + return; + } + + RRDHOST *host = st->rrdhost; + + time_t now = now_realtime_sec(); + + if (likely(rc->status != RRDCALC_STATUS_REMOVED)) { + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc, + now, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + RRDCALC_STATUS_REMOVED, + 0, + 0); + + health_log_alert(host, ae); + health_alarm_log_add_entry(host, ae); + } + + // unlink it + + if(!having_ll_wrlock) + rw_spinlock_write_lock(&st->alerts.spinlock); + + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(st->alerts.base, rc, prev, next); + + if(!having_ll_wrlock) + rw_spinlock_write_unlock(&st->alerts.spinlock); + + rc->rrdset = NULL; +} + +// ---------------------------------------------------------------------------- +// RRDCALC rrdhost index management - constructor + +struct rrdcalc_constructor { + RRDSET *rrdset; + RRD_ALERT_PROTOTYPE *ap; + + enum { + RRDCALC_REACT_NONE, + RRDCALC_REACT_NEW, + } 
react_action; +}; + +static void rrdcalc_rrdhost_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc, void *constructor_data) { + RRDCALC *rc = rrdcalc; + struct rrdcalc_constructor *ctr = constructor_data; + RRDSET *st = ctr->rrdset; + RRDHOST *host = st->rrdhost; + RRD_ALERT_PROTOTYPE *ap = ctr->ap; + + rc->key = string_strdupz(dictionary_acquired_item_name(item)); + rc->rrdset = st; + rc->chart = string_dup(st->id); + + health_prototype_copy_config(&rc->config, &ap->config); + health_prototype_copy_match_without_patterns(&rc->match, &ap->match); + + rc->next_event_id = 1; + rc->value = NAN; + rc->old_value = NAN; + rc->last_repeat = 0; + rc->times_repeat = 0; + rc->last_status_change_value = rc->value; + rc->last_status_change = now_realtime_sec(); + + if(!rc->config.units) + rc->config.units = string_dup(st->units); + + if(rc->config.update_every < rc->rrdset->update_every) { + netdata_log_info( + "HEALTH: alert '%s.%s' has update every %d, less than chart update every %d. 
" + "Setting alarm update frequency to %d.", + string2str(st->id), string2str(rc->config.name), + rc->config.update_every, rc->rrdset->update_every, rc->rrdset->update_every); + + rc->config.update_every = st->update_every; + } + + rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->config.name, &rc->next_event_id, &rc->config.hash_id); + + expression_set_variable_lookup_callback(rc->config.calculation, alert_variable_lookup, rc); + expression_set_variable_lookup_callback(rc->config.warning, alert_variable_lookup, rc); + expression_set_variable_lookup_callback(rc->config.critical, alert_variable_lookup, rc); + + rrdcalc_update_info_using_rrdset_labels(rc); + + ctr->react_action = RRDCALC_REACT_NEW; +} + +static bool rrdcalc_rrdhost_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc __maybe_unused, void *rrdcalc_new __maybe_unused, void *constructor_data) { + struct rrdcalc_constructor *ctr = constructor_data; + ctr->react_action = RRDCALC_REACT_NONE; + return false; +} + +static void rrdcalc_rrdhost_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc, void *constructor_data) { + RRDCALC *rc = rrdcalc; + struct rrdcalc_constructor *ctr = constructor_data; + + if(ctr->react_action == RRDCALC_REACT_NEW) + rrdcalc_link_to_rrdset(rc); +} + +// ---------------------------------------------------------------------------- +// RRDCALC rrdhost index management - destructor + +static void rrdcalc_free_internals(RRDCALC *rc) { + if(unlikely(!rc)) return; + + rrd_alert_match_cleanup(&rc->match); + rrd_alert_config_cleanup(&rc->config); + + string_freez(rc->key); + string_freez(rc->chart); + + string_freez(rc->info); + string_freez(rc->summary); +} + +static void rrdcalc_rrdhost_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc, void *rrdhost __maybe_unused) { + RRDCALC *rc = rrdcalc; + //RRDHOST *host = rrdhost; + + if(unlikely(rc->rrdset)) + rrdcalc_unlink_from_rrdset(rc, false); + + // any destruction 
actions that require other locks + // have to be placed in rrdcalc_del(), because the object is actually locked for deletion + + rrdcalc_free_internals(rc); +} + +// ---------------------------------------------------------------------------- +// RRDCALC rrdhost index management - index API + +void rrdcalc_rrdhost_index_init(RRDHOST *host) { + if(!host->rrdcalc_root_index) { + host->rrdcalc_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDCALC)); + + dictionary_register_insert_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_insert_callback, NULL); + dictionary_register_conflict_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_conflict_callback, NULL); + dictionary_register_react_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_react_callback, NULL); + dictionary_register_delete_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_delete_callback, host); + } +} + +void rrdcalc_rrdhost_index_destroy(RRDHOST *host) { + dictionary_destroy(host->rrdcalc_root_index); + host->rrdcalc_root_index = NULL; +} + +bool rrdcalc_add_from_prototype(RRDHOST *host, RRDSET *st, RRD_ALERT_PROTOTYPE *ap) { + char key[RRDCALC_MAX_KEY_SIZE + 1]; + size_t key_len = rrdcalc_key(key, RRDCALC_MAX_KEY_SIZE, + string2str(st->id), string2str(ap->config.name)); + + struct rrdcalc_constructor tmp = { + .ap = ap, + .rrdset = st, + .react_action = RRDCALC_REACT_NONE, + }; + + bool ret = true; + + dictionary_set_advanced(host->rrdcalc_root_index, key, (ssize_t)key_len, + NULL, sizeof(RRDCALC), &tmp); + + if(tmp.react_action != RRDCALC_REACT_NEW) + ret = false; + + return ret; +} + +void rrdcalc_unlink_and_delete(RRDHOST *host, RRDCALC *rc, bool having_ll_wrlock) { + if(rc->rrdset) + rrdcalc_unlink_from_rrdset(rc, having_ll_wrlock); + + dictionary_del_advanced(host->rrdcalc_root_index, string2str(rc->key), (ssize_t)string_strlen(rc->key)); +} + + +// 
---------------------------------------------------------------------------- +// RRDCALC cleanup API functions + +void rrdcalc_unlink_and_delete_all_rrdset_alerts(RRDSET *st) { + RRDCALC *rc, *last = NULL; + rw_spinlock_write_lock(&st->alerts.spinlock); + while((rc = st->alerts.base)) { + if(last == rc) { + netdata_log_error("RRDCALC: malformed list of alerts linked to chart - cannot cleanup - giving up."); + break; + } + last = rc; + + rrdcalc_unlink_and_delete(st->rrdhost, rc, true); + } + rw_spinlock_write_unlock(&st->alerts.spinlock); +} + +void rrdcalc_delete_all(RRDHOST *host) { + dictionary_flush(host->rrdcalc_root_index); +} + +void rrdcalc_child_disconnected(RRDHOST *host) { + rrdcalc_delete_all(host); + + rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); + RRDSET *st; + rrdset_foreach_read(st, host) { + rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION); + } + rrdset_foreach_done(st); +} + +void rrd_alert_match_cleanup(struct rrd_alert_match *am) { + if(am->is_template) + string_freez(am->on.context); + else + string_freez(am->on.chart); + + string_freez(am->host_labels); + pattern_array_free(am->host_labels_pattern); + + string_freez(am->chart_labels); + pattern_array_free(am->chart_labels_pattern); +} + +void rrd_alert_config_cleanup(struct rrd_alert_config *ac) { + string_freez(ac->name); + + string_freez(ac->exec); + string_freez(ac->recipient); + + string_freez(ac->classification); + string_freez(ac->component); + string_freez(ac->type); + + string_freez(ac->source); + string_freez(ac->units); + string_freez(ac->summary); + string_freez(ac->info); + + string_freez(ac->dimensions); + + expression_free(ac->calculation); + expression_free(ac->warning); + expression_free(ac->critical); +} diff --git a/src/health/rrdcalc.h b/src/health/rrdcalc.h new file mode 100644 index 000000000..3a7951a73 --- /dev/null +++ b/src/health/rrdcalc.h @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include 
"database/rrd.h" +#include "web/api/queries/rrdr.h" +#include "health_prototypes.h" + +#ifndef NETDATA_RRDCALC_H +#define NETDATA_RRDCALC_H 1 + +// calculated variables (defined in health configuration) +// These aggregate time-series data at fixed intervals +// (defined in their update_every member below) +// They increase the overhead of netdata. +// +// These calculations are stored under RRDHOST. +// Then are also linked to RRDSET (of course only when a +// matching chart is found). + +typedef enum rrdcalc_status { + RRDCALC_STATUS_REMOVED = -2, + RRDCALC_STATUS_UNDEFINED = -1, + RRDCALC_STATUS_UNINITIALIZED = 0, + RRDCALC_STATUS_CLEAR = 1, + RRDCALC_STATUS_RAISED = 2, // DO NOT CHANGE THESE NUMBERS + RRDCALC_STATUS_WARNING = 3, // DO NOT CHANGE THESE NUMBERS + RRDCALC_STATUS_CRITICAL = 4, // DO NOT CHANGE THESE NUMBERS +} RRDCALC_STATUS; + +typedef enum { + RRDCALC_FLAG_DB_ERROR = (1 << 0), + RRDCALC_FLAG_DB_NAN = (1 << 1), + // RRDCALC_FLAG_DB_STALE = (1 << 2), + RRDCALC_FLAG_CALC_ERROR = (1 << 3), + RRDCALC_FLAG_WARN_ERROR = (1 << 4), + RRDCALC_FLAG_CRIT_ERROR = (1 << 5), + RRDCALC_FLAG_RUNNABLE = (1 << 6), + RRDCALC_FLAG_DISABLED = (1 << 7), + RRDCALC_FLAG_SILENCED = (1 << 8), + RRDCALC_FLAG_RUN_ONCE = (1 << 9), +} RRDCALC_FLAGS; +void rrdcalc_flags_to_json_array(BUFFER *wb, const char *key, RRDCALC_FLAGS flags); + +#define RRDCALC_ALL_OPTIONS_EXCLUDING_THE_RRDR_ONES (RRDCALC_OPTION_NO_CLEAR_NOTIFICATION) + +struct rrdcalc { + uint32_t id; // the unique id of this alarm + uint32_t next_event_id; // the next event id that will be used for this alarm + + STRING *key; // the unique key in the host's rrdcalc_root_index + STRING *chart; // the chart id this should be linked to + + struct rrd_alert_match match; + struct rrd_alert_config config; + + // ------------------------------------------------------------------------ + // runtime information + + STRING *summary; // the original summary field before any variable replacement + STRING *info; // the original 
info field before any variable replacement + + RRDCALC_STATUS old_status; // the old status of the alarm + RRDCALC_STATUS status; // the current status of the alarm + + NETDATA_DOUBLE value; // the current value of the alarm + NETDATA_DOUBLE old_value; // the previous value of the alarm + NETDATA_DOUBLE last_status_change_value; // the value at the last status change + + RRDCALC_FLAGS run_flags; // check RRDCALC_FLAG_* + + time_t last_updated; // the last update timestamp of the alarm + time_t next_update; // the next update timestamp of the alarm + time_t last_status_change; // the timestamp of the last time this alarm changed status + time_t last_repeat; // the last time the alarm got repeated + uint32_t times_repeat; // number of times the alarm got repeated + + time_t db_after; // the first timestamp evaluated by the db lookup + time_t db_before; // the last timestamp evaluated by the db lookup + + time_t delay_up_to_timestamp; // the timestamp up to which we should delay notifications + int delay_up_current; // the current up notification delay duration + int delay_down_current; // the current down notification delay duration + int delay_last; // the last delay we used + + // ------------------------------------------------------------------------ + // the chart this alarm it is linked to + + size_t labels_version; + struct rrdset *rrdset; + + struct rrdcalc *next; + struct rrdcalc *prev; +}; + +#define rrdcalc_name(rc) string2str((rc)->config.name) +#define rrdcalc_chart_name(rc) string2str((rc)->chart) +#define rrdcalc_exec(rc) string2str((rc)->config.exec) +#define rrdcalc_recipient(rc) string2str((rc)->config.recipient) +#define rrdcalc_classification(rc) string2str((rc)->config.classification) +#define rrdcalc_component(rc) string2str((rc)->config.component) +#define rrdcalc_type(rc) string2str((rc)->config.type) +#define rrdcalc_source(rc) string2str((rc)->config.source) +#define rrdcalc_units(rc) string2str((rc)->config.units) +#define 
rrdcalc_dimensions(rc) string2str((rc)->config.dimensions) + +#define foreach_rrdcalc_in_rrdhost_read(host, rc) \ + dfe_start_read((host)->rrdcalc_root_index, rc) \ + +#define foreach_rrdcalc_in_rrdhost_reentrant(host, rc) \ + dfe_start_reentrant((host)->rrdcalc_root_index, rc) + +#define foreach_rrdcalc_in_rrdhost_done(rc) \ + dfe_done(rc) + +#define RRDCALC_HAS_DB_LOOKUP(rc) ((rc)->config.after) + +void rrdcalc_update_info_using_rrdset_labels(RRDCALC *rc); + +const RRDCALC_ACQUIRED *rrdcalc_from_rrdset_get(RRDSET *st, const char *alert_name); +void rrdcalc_from_rrdset_release(RRDSET *st, const RRDCALC_ACQUIRED *rca); +RRDCALC *rrdcalc_acquired_to_rrdcalc(const RRDCALC_ACQUIRED *rca); + +const char *rrdcalc_status2string(RRDCALC_STATUS status); + +uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, nd_uuid_t *config_hash_id); + +static inline int rrdcalc_isrepeating(RRDCALC *rc) { + if (unlikely(rc->config.warn_repeat_every > 0 || rc->config.crit_repeat_every > 0)) { + return 1; + } + return 0; +} + +void rrdcalc_unlink_and_delete_all_rrdset_alerts(RRDSET *st); +void rrdcalc_delete_all(RRDHOST *host); + +void rrdcalc_rrdhost_index_init(RRDHOST *host); +void rrdcalc_rrdhost_index_destroy(RRDHOST *host); + +void rrdcalc_unlink_and_delete(RRDHOST *host, RRDCALC *rc, bool having_ll_wrlock); + +#define RRDCALC_VAR_MAX 100 +#define RRDCALC_VAR_FAMILY "${family}" +#define RRDCALC_VAR_LABEL "${label:" +#define RRDCALC_VAR_LABEL_LEN (sizeof(RRDCALC_VAR_LABEL)-1) + +void rrdcalc_child_disconnected(RRDHOST *host); + +#endif //NETDATA_RRDCALC_H diff --git a/src/health/rrdvar.c b/src/health/rrdvar.c new file mode 100644 index 000000000..4e28e62a3 --- /dev/null +++ b/src/health/rrdvar.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "database/rrd.h" + +typedef struct rrdvar { + NETDATA_DOUBLE value; +} RRDVAR; + +// ---------------------------------------------------------------------------- +// RRDVAR 
management + +inline int rrdvar_fix_name(char *variable) { + int fixed = 0; + while(*variable) { + if (!isalnum((uint8_t)*variable) && *variable != '.' && *variable != '_') { + *variable++ = '_'; + fixed++; + } + else + variable++; + } + + return fixed; +} + +inline STRING *rrdvar_name_to_string(const char *name) { + char *variable = strdupz(name); + rrdvar_fix_name(variable); + STRING *name_string = string_strdupz(variable); + freez(variable); + return name_string; +} + +static bool rrdvar_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data __maybe_unused) { + RRDVAR *rv = old_value; + RRDVAR *nrv = new_value; + + rv->value = nrv->value; + return false; +} + +DICTIONARY *rrdvariables_create(void) { + DICTIONARY *dict = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDVAR)); + dictionary_register_conflict_callback(dict, rrdvar_conflict_callback, NULL); + return dict; +} + +void rrdvariables_destroy(DICTIONARY *dict) { + dictionary_destroy(dict); +} + +static inline const RRDVAR_ACQUIRED *rrdvar_get_and_acquire(DICTIONARY *dict, STRING *name) { + return (const RRDVAR_ACQUIRED *)dictionary_get_and_acquire_item_advanced(dict, string2str(name), (ssize_t)string_strlen(name)); +} + +inline const RRDVAR_ACQUIRED *rrdvar_add_and_acquire(DICTIONARY *dict, STRING *name, NETDATA_DOUBLE value) { + if(unlikely(!dict || !name)) return NULL; + RRDVAR tmp = { + .value = value, + }; + return (const RRDVAR_ACQUIRED *)dictionary_set_and_acquire_item_advanced( + dict, string2str(name), (ssize_t)string_strlen(name), + &tmp, sizeof(tmp), NULL); +} + +void rrdvar_delete_all(DICTIONARY *dict) { + dictionary_flush(dict); +} + +void rrdvar_release(DICTIONARY *dict, const RRDVAR_ACQUIRED *rva) { + if(unlikely(!dict || !rva)) return; // when health is not enabled + dictionary_acquired_item_release(dict, (const DICTIONARY_ITEM *)rva); +} + +// 
---------------------------------------------------------------------------- +// CUSTOM HOST VARIABLES + +inline int rrdvar_walkthrough_read(DICTIONARY *dict, int (*callback)(const DICTIONARY_ITEM *item, void *rrdvar, void *data), void *data) { + if(unlikely(!dict)) return 0; // when health is not enabled + return dictionary_walkthrough_read(dict, callback, data); +} + +const RRDVAR_ACQUIRED *rrdvar_host_variable_add_and_acquire(RRDHOST *host, const char *name) { + if(unlikely(!host->rrdvars)) return NULL; // when health is not enabled + + STRING *name_string = rrdvar_name_to_string(name); + const RRDVAR_ACQUIRED *rva = rrdvar_add_and_acquire(host->rrdvars, name_string, NAN); + + string_freez(name_string); + return rva; +} + +void rrdvar_host_variable_set(RRDHOST *host, const RRDVAR_ACQUIRED *rva, NETDATA_DOUBLE value) { + if(unlikely(!host->rrdvars || !rva)) return; // when health is not enabled + + RRDVAR *rv = dictionary_acquired_item_value((const DICTIONARY_ITEM *)rva); + if(rv->value != value) { + rv->value = value; + + // if the host is streaming, send this variable upstream immediately + rrdpush_sender_send_this_host_variable_now(host, rva); + } +} + +// ---------------------------------------------------------------------------- +// CUSTOM CHART VARIABLES + +const RRDVAR_ACQUIRED *rrdvar_chart_variable_add_and_acquire(RRDSET *st, const char *name) { + if(unlikely(!st->rrdvars)) return NULL; + + STRING *name_string = rrdvar_name_to_string(name); + const RRDVAR_ACQUIRED *rs = rrdvar_add_and_acquire(st->rrdvars, name_string, NAN); + string_freez(name_string); + return rs; +} + +void rrdvar_chart_variable_set(RRDSET *st, const RRDVAR_ACQUIRED *rva, NETDATA_DOUBLE value) { + if(unlikely(!st->rrdvars || !rva)) return; + + RRDVAR *rv = dictionary_acquired_item_value((const DICTIONARY_ITEM *)rva); + if(rv->value != value) { + rv->value = value; + rrdset_flag_set(st, RRDSET_FLAG_UPSTREAM_SEND_VARIABLES); + } +} + +// 
---------------------------------------------------------------------------- +// RRDVAR lookup + +NETDATA_DOUBLE rrdvar2number(const RRDVAR_ACQUIRED *rva) { + if(unlikely(!rva)) return NAN; + RRDVAR *rv = dictionary_acquired_item_value((const DICTIONARY_ITEM *)rva); + return rv->value; +} + +static inline bool rrdvar_get_value(DICTIONARY *dict, STRING *variable, NETDATA_DOUBLE *result) { + bool found = false; + + const RRDVAR_ACQUIRED *rva = rrdvar_get_and_acquire(dict, variable); + if(rva) { + *result = rrdvar2number(rva); + found = true; + dictionary_acquired_item_release(dict, (const DICTIONARY_ITEM *)rva); + } + + return found; +} + +bool rrdvar_get_custom_host_variable_value(RRDHOST *host, STRING *variable, NETDATA_DOUBLE *result) { + return rrdvar_get_value(host->rrdvars, variable, result); +} + +bool rrdvar_get_custom_chart_variable_value(RRDSET *st, STRING *variable, NETDATA_DOUBLE *result) { + return rrdvar_get_value(st->rrdvars, variable, result); +} + +// ---------------------------------------------------------------------------- +// RRDVAR to JSON + +void rrdvar_to_json_members(DICTIONARY *dict, BUFFER *wb) { + RRDVAR *rv; + dfe_start_read(dict, rv) { + buffer_json_member_add_double(wb, rv_dfe.name, rv->value); + } + dfe_done(rv); +} + +void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf) { + rrdvar_to_json_members(st->rrdvars, buf); +} + +void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *wb) { + + // FIXME this list is incomplete + // alerts can also access {context}.{dimension} from the entire host database + + RRDHOST *host = st->rrdhost; + + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); + + buffer_json_member_add_string(wb, "chart", rrdset_id(st)); + buffer_json_member_add_string(wb, "chart_name", rrdset_name(st)); + buffer_json_member_add_string(wb, "chart_context", rrdset_context(st)); + buffer_json_member_add_string(wb, "family", rrdset_family(st)); + buffer_json_member_add_string(wb, 
"host", rrdhost_hostname(host)); + + time_t now = now_realtime_sec(); + + buffer_json_member_add_object(wb, "current_alert_values"); + { + buffer_json_member_add_double(wb, "this", NAN); + buffer_json_member_add_double(wb, "after", (NETDATA_DOUBLE)now - 1); + buffer_json_member_add_double(wb, "before", (NETDATA_DOUBLE)now); + buffer_json_member_add_double(wb, "now", (NETDATA_DOUBLE)now); + buffer_json_member_add_double(wb, "status", (NETDATA_DOUBLE)RRDCALC_STATUS_REMOVED); + buffer_json_member_add_double(wb, "REMOVED", (NETDATA_DOUBLE)RRDCALC_STATUS_REMOVED); + buffer_json_member_add_double(wb, "UNDEFINED", (NETDATA_DOUBLE)RRDCALC_STATUS_UNDEFINED); + buffer_json_member_add_double(wb, "UNINITIALIZED", (NETDATA_DOUBLE)RRDCALC_STATUS_UNINITIALIZED); + buffer_json_member_add_double(wb, "CLEAR", (NETDATA_DOUBLE)RRDCALC_STATUS_CLEAR); + buffer_json_member_add_double(wb, "WARNING", (NETDATA_DOUBLE)RRDCALC_STATUS_WARNING); + buffer_json_member_add_double(wb, "CRITICAL", (NETDATA_DOUBLE)RRDCALC_STATUS_CRITICAL); + buffer_json_member_add_double(wb, "green", NAN); + buffer_json_member_add_double(wb, "red", NAN); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "dimensions_last_stored_values"); + { + RRDDIM *rd; + dfe_start_read(st->rrddim_root_index, rd) { + buffer_json_member_add_double(wb, string2str(rd->id), rd->collector.last_stored_value); + if(rd->name != rd->id) + buffer_json_member_add_double(wb, string2str(rd->name), rd->collector.last_stored_value); + } + dfe_done(rd); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "dimensions_last_collected_values"); + { + char name[RRD_ID_LENGTH_MAX + 1 + 100]; + RRDDIM *rd; + dfe_start_read(st->rrddim_root_index, rd) { + snprintfz(name, sizeof(name), "%s_raw", string2str(rd->id)); + buffer_json_member_add_int64(wb, name, rd->collector.last_collected_value); + if(rd->name != rd->id) { + snprintfz(name, sizeof(name), "%s_raw", string2str(rd->name)); + 
buffer_json_member_add_int64(wb, name, rd->collector.last_collected_value); + } + } + dfe_done(rd); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "dimensions_last_collected_time"); + { + char name[RRD_ID_LENGTH_MAX + 1 + 100]; + RRDDIM *rd; + dfe_start_read(st->rrddim_root_index, rd) { + snprintfz(name, sizeof(name), "%s_last_collected_t", string2str(rd->id)); + buffer_json_member_add_int64(wb, name, rd->collector.last_collected_time.tv_sec); + if(rd->name != rd->id) { + snprintfz(name, sizeof(name), "%s_last_collected_t", string2str(rd->name)); + buffer_json_member_add_int64(wb, name, rd->collector.last_collected_time.tv_sec); + } + } + dfe_done(rd); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "chart_variables"); + { + buffer_json_member_add_int64(wb, "update_every", st->update_every); + buffer_json_member_add_uint64(wb, "last_collected_t", st->last_collected_time.tv_sec); + + rrdvar_to_json_members(st->rrdvars, wb); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "host_variables"); + { + rrdvar_to_json_members(st->rrdhost->rrdvars, wb); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "alerts"); + { + struct scored { + bool existing; + STRING *chart; + STRING *context; + NETDATA_DOUBLE value; + size_t score; + } tmp, *z; + DICTIONARY *dict = dictionary_create(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE); + + RRDCALC *rc; + dfe_start_read(st->rrdhost->rrdcalc_root_index, rc) { + tmp = (struct scored) { + .existing = false, + .chart = string_dup(rc->rrdset->id), + .context = string_dup(rc->rrdset->context), + .value = rc->value, + .score = rrdlabels_common_count(rc->rrdset->rrdlabels, st->rrdlabels), + }; + z = dictionary_set(dict, string2str(rc->config.name), &tmp, sizeof(tmp)); + + if(z->existing) { + if(tmp.score > z->score) + SWAP(*z, tmp); + z->existing = true; + string_freez(tmp.chart); + string_freez(tmp.context); + } + else + 
z->existing = true; + } + dfe_done(rc); + + dfe_start_read(dict, z) { + buffer_json_member_add_object(wb, z_dfe.name); + { + buffer_json_member_add_double(wb, "value", z->value); + buffer_json_member_add_string(wb, "instance", string2str(z->chart)); + buffer_json_member_add_string(wb, "context", string2str(z->context)); + buffer_json_member_add_uint64(wb, "score", z->score); + } + buffer_json_object_close(wb); + + string_freez(z->chart); + string_freez(z->context); + } + dfe_done(z); + + dictionary_destroy(dict); + } + buffer_json_object_close(wb); + + buffer_json_finalize(wb); +} + +// ---------------------------------------------------------------------------- +// RRDVAR private members examination + +const char *rrdvar_name(const RRDVAR_ACQUIRED *rva) { + return dictionary_acquired_item_name((const DICTIONARY_ITEM *)rva); +} + +void rrdvar_print_to_streaming_custom_chart_variables(RRDSET *st, BUFFER *wb) { + rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_SEND_VARIABLES); + + // send the chart local custom variables + RRDVAR *rv; + dfe_start_read(st->rrdvars, rv) { + buffer_sprintf(wb + , "VARIABLE CHART %s = " NETDATA_DOUBLE_FORMAT "\n" + , rv_dfe.name, rv->value + ); + } + dfe_done(rv); +} diff --git a/src/health/rrdvar.h b/src/health/rrdvar.h new file mode 100644 index 000000000..31530589d --- /dev/null +++ b/src/health/rrdvar.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_RRDVAR_H +#define NETDATA_RRDVAR_H 1 + +#include "libnetdata/libnetdata.h" + +#define RRDVAR_MAX_LENGTH 1024 + +int rrdvar_fix_name(char *variable); + +#include "database/rrd.h" + +STRING *rrdvar_name_to_string(const char *name); + +const RRDVAR_ACQUIRED *rrdvar_host_variable_add_and_acquire(RRDHOST *host, const char *name); +void rrdvar_host_variable_set(RRDHOST *host, const RRDVAR_ACQUIRED *rva, NETDATA_DOUBLE value); + +int rrdvar_walkthrough_read(DICTIONARY *dict, int (*callback)(const DICTIONARY_ITEM *item, void *rrdvar, void *data), void *data); + 
+#define rrdvar_host_variable_release(host, rva) rrdvar_release((host)->rrdvars, rva) +#define rrdvar_chart_variable_release(st, rva) rrdvar_release((st)->rrdvars, rva) +void rrdvar_release(DICTIONARY *dict, const RRDVAR_ACQUIRED *rva); + +NETDATA_DOUBLE rrdvar2number(const RRDVAR_ACQUIRED *rva); + +const RRDVAR_ACQUIRED *rrdvar_add_and_acquire(DICTIONARY *dict, STRING *name, NETDATA_DOUBLE value); + +DICTIONARY *rrdvariables_create(void); +void rrdvariables_destroy(DICTIONARY *dict); + +void rrdvar_delete_all(DICTIONARY *dict); + +const char *rrdvar_name(const RRDVAR_ACQUIRED *rva); + +void rrdvar_print_to_streaming_custom_chart_variables(RRDSET *st, BUFFER *wb); + +const RRDVAR_ACQUIRED *rrdvar_chart_variable_add_and_acquire(RRDSET *st, const char *name); +void rrdvar_chart_variable_set(RRDSET *st, const RRDVAR_ACQUIRED *rva, NETDATA_DOUBLE value); + +bool rrdvar_get_custom_host_variable_value(RRDHOST *host, STRING *variable, NETDATA_DOUBLE *result); +bool rrdvar_get_custom_chart_variable_value(RRDSET *st, STRING *variable, NETDATA_DOUBLE *result); + +#endif //NETDATA_RRDVAR_H diff --git a/src/health/schema.d/health%3Aalert%3Aprototype.json b/src/health/schema.d/health%3Aalert%3Aprototype.json new file mode 100644 index 000000000..309d052de --- /dev/null +++ b/src/health/schema.d/health%3Aalert%3Aprototype.json @@ -0,0 +1,687 @@ +{ + "jsonSchema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "matchInstanceLabels": { + "type": "string", + "default": "*", + "title": "Only for instances with these labels" + }, + "matchHostLabels": { + "type": "string", + "default": "*", + "title": "Only for nodes with these host labels" + }, + "matchInstance": { + "type": "object", + "title": "Apply this rule to a single instance", + "description": "This is a single alert rule that will be applied to the specific instance on all nodes hosted on this Netdata.", + "properties": { + "on": { + "type": "string", + "default": "", + "title": "The instance 
this rule should be applied to.", + "description": "You can find the instance names on all charts at the instances drop down menu. Do not include the host name in this field." + }, + "host_labels": { "$ref": "#/definitions/matchHostLabels" }, + "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" } + }, + "required": [ + "on", + "host_labels", + "instance_labels" + ] + }, + "matchTemplate": { + "type": "object", + "title": "Apply this rule to all instances of a context", + "description": "This rule defines a template, that will apply this alert to all instances (e.g. disks, network interfaces, nginx servers, etc) on all nodes hosted on this Netdata.", + "properties": { + "on": { + "type": "string", + "default": "", + "title": "The context of the instances this rule should be applied to.", + "description": "The context is the code-name of each chart on the dashboard, that appears at the chart title bar, between the chart title and its unit of measurement, like: system.cpu, disk.io, etc." + }, + "host_labels": { "$ref": "#/definitions/matchHostLabels" }, + "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" } + }, + "required": [ + "on", + "host_labels", + "instance_labels" + ] + }, + "configSummary": { + "type": "string", + "title": "Short description of the alert", + "description": "This field is used in notification as a short description of the alert. Variables, like ${label:key}, are replaced with the value of instance label called 'key'." + }, + "configInfo": { + "type": "string", + "title": "Long description of the alert", + "description": "This field is used to provide enough information about the type and nature of the alert. Variables, like ${label:key}, are replaced with the value of instance label called 'key'." + }, + "configType": { + "type": "string", + "title": "Alert Type", + "description": "Use categories like: 'System', 'Containers', 'Web Servers', 'Message Brokers', etc." 
+ }, + "configComponent": { + "type": "string", + "title": "Alert Component", + "description": "Component is a sub-type of Alert Type. Examples: 'CPU', 'Memory', 'Network', 'Disk', 'Hardware', 'nginx', 'redis', 'postgresql', etc." + }, + "configClassification": { + "type": "string", + "title": "Classification", + "description": "Use 'Workload', 'Utilization', 'Latency', 'Availability', 'Errors', etc." + }, + "configValue": { + "type": "object", + "title": "", + "description": "Each alert has a value. This section defines how this value is calculated.", + "properties": { + "database_lookup": { + "type": "object", + "title": "Database Query to Get Value", + "description": "The database query to be executed to calculate the value of the alert. When set, the query is executed before any other calculations. The result of the query will be available as $this in further calculations.", + "properties": { + "data_source": { + "type": "string", + "oneOf": [ + { "const": "samples", "title": "Samples", "description": "Use the time-series values for each dimension" }, + { "const": "percentages", "title": "Percentages", "description": "Use the percentage of each dimension vs the sum of all dimensions" }, + { "const": "anomalies", "title": "Anomalies", "description": "Use the anomaly rate of each dimension" } + ], + "default": "samples", + "title": " ", + "description": "" + }, + "time_group": { + "type": "string", + "oneOf": [ + { "const": "average", "title": "Average" }, + { "const": "median", "title": "Median" }, + { "const": "min", "title": "Minimum" }, + { "const": "max", "title": "Maximum" }, + { "const": "sum", "title": "Sum" }, + { "const": "incremental_sum", "title": "Incremental Sum" }, + { "const": "stddev", "title": "Standard Deviation" }, + { "const": "cv", "title": "Coefficient of Variation" }, + { "const": "trimmed-mean", "title": "Trimmed Mean" }, + { "const": "trimmed-median", "title": "Trimmed Median" }, + { "const": "percentile", "title": "Percentile" }, + { 
"const": "ses", "title": "Simple Exponential Smoothing" }, + { "const": "des", "title": "Double Exponential Smoothing" }, + { "const": "countif", "title": "Count If" } + ], + "default": "average", + "title": "Time Aggregation", + "description": "" + }, + "after": { + "type": "integer", + "default": -600, + "title": "From", + "description": "Relative to 'To'" + }, + "before": { + "type": "integer", + "default": 0, + "title": "To", + "description": "Ending timestamp" + }, + "dims_group": { + "type": "string", + "oneOf": [ + { "const": "average", "title": "Average", "description": "The average of all dimensions" }, + { "const": "min", "title": "Minimum", "description": "The minimum of all dimensions" }, + { "const": "max", "title": "Maximum", "description": "The maximum of all dimensions" }, + { "const": "sum", "title": "Sum", "description": "The sum of all dimensions" }, + { "const": "min2max", "title": "Min-to-Max", "description": "The delta between the minimum and the maximum of the dimensions" } + ], + "default": "sum", + "title": "Dims Aggregation", + "description": "on dimensions" + }, + "dimensions": { + "type": "string", + "title": "Dimensions", + "description": "Simple pattern", + "default": "*" + }, + "options": { + "type": "array", + "title": "Time-Series Query options", + "description": "Options affecting the way the value is calculated", + "uniqueItems": true, + "items": { + "oneOf": [ + { "const": "unaligned", "title": "Do not shift the time-frame for visual presentation" }, + { "const": "abs", "title": "Make all values positive before using them" }, + { "const": "null2zero", "title": "Treat gaps in the time-series as a zero value" }, + { "const": "match_ids", "title": "Match only dimension IDs, not Names" }, + { "const": "match_names", "title": "Match only dimension Names, not IDs" } + ] + }, + "default": [ "unaligned" ] + } + }, + "allOf": [ + { + "if": { + "properties": { + "time_group": { + "enum": ["trimmed-mean"] + } + } + }, + "then": { + 
"properties": { + "time_group_value": { + "type": "integer", + "default": 1, + "title": "Trim %", + "description": "" + } + }, + "required": ["time_group_value"] + } + }, + { + "if": { + "properties": { + "time_group": { + "enum": ["trimmed-median"] + } + } + }, + "then": { + "properties": { + "time_group_value": { + "type": "integer", + "default": 1, + "title": "Trim %", + "description": "" + } + }, + "required": ["time_group_value"] + } + }, + { + "if": { + "properties": { + "time_group": { + "enum": ["percentile"] + } + } + }, + "then": { + "properties": { + "time_group_value": { + "type": "integer", + "default": 95, + "title": "Percentage", + "description": "" + } + }, + "required": ["time_group_value"] + } + }, + { + "if": { + "properties": { + "time_group": { + "const": "countif" + } + } + }, + "then": { + "properties": { + "time_group_condition": { + "type": "string", + "oneOf": [ + { "const": "!=", "title": "!=" }, + { "const": "=", "title": "==" }, + { "const": ">=", "title": ">=" }, + { "const": ">", "title": ">" }, + { "const": "<=", "title": "<=" }, + { "const": "<", "title": "<" } + ], + "default": "=", + "title": "Condition", + "description": "" + }, + "time_group_value": { + "type": "number", + "default": 1, + "title": "Value to match", + "description": "" + } + }, + "required": ["time_group_condition", "time_group_value"] + } + } + ] + }, + "calculation": { + "type": "string", + "title": "Calculation", + "description": "An expression to transform the value" + }, + "units": { + "type": "string", + "title": "Unit", + "description": "of measurement" + }, + "update_every": { + "type": "integer", + "default": 10, + "minimum": 1, + "title": "Frequency", + "description": "of evaluation" + } + } + }, + "configConditions": { + "type": "object", + "title": "", + "properties": { + "warning_condition": { + "type": "string", + "title": "Warning Expression", + "description": "The alert value is available as '$this'. 
If this expression evaluates to a non-zero value, the alert is considered to be in warning level." + }, + "critical_condition": { + "type": "string", + "title": "Critical Expression", + "description": "The alert value is available as '$this'. If this expression evaluates to a non-zero value, the alert is considered to be in critical level." + } + } + }, + "configAction": { + "type": "object", + "title": "", + "description": "The action the alert should take when it transitions states", + "properties": { + "execute": { + "type": "string", + "title": "Command to execute when the alert transitions states" + }, + "recipient": { + "type": "string", + "title": "Recipient(s)" + }, + "options": { + "type": "array", + "title": "Action Options", + "uniqueItems": true, + "items": { + "oneOf": [ + { "const": "no-clear-notification", "title": "Do not perform any action when the alert is cleared"} + ] + }, + "default": [] + }, + "delay": { + "type": "object", + "title": "Delay the action (notification)", + "description": "Rules to postpone the action, to avoid multiple notifications on flapping alerts.", + "properties": { + "up": { + "type": "integer", + "title": "Delay when raising" + }, + "down": { + "type": "integer", + "title": "Delay when going Down" + }, + "multiplier": { + "type": "number", + "title": "Back-Off" + }, + "max": { + "type": "integer", + "title": "Max" + } + } + }, + "repeat": { + "type": "object", + "title": "Auto-Repeat Action", + "description": "Repeat the action while the alert is raised.", + "properties": { + "enabled": { + "type": "boolean" + }, + "warning": { + "type": "integer", + "title": "Repeat on Warning" + }, + "critical": { + "type": "integer", + "title": "Repeat on Critical" + } + } + } + } + }, + "configInstance": { + "type": "object", + "title": "Alert Configuration", + "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, 
etc.", + "properties": { + "match": { "$ref": "#/definitions/matchInstance" }, + "summary": { "$ref": "#/definitions/configSummary" }, + "info": { "$ref": "#/definitions/configInfo" }, + "type": { "$ref": "#/definitions/configType" }, + "component": { "$ref": "#/definitions/configComponent" }, + "classification": { "$ref": "#/definitions/configClassification" }, + "value": { "$ref": "#/definitions/configValue" }, + "conditions": { "$ref": "#/definitions/configConditions" }, + "actions": { "$ref": "#/definitions/configAction" } + }, + "required": [] + }, + "configTemplate": { + "type": "object", + "title": "Alert Configuration", + "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, etc.", + "properties": { + "match": { "$ref": "#/definitions/matchTemplate" }, + "summary": { "$ref": "#/definitions/configSummary" }, + "info": { "$ref": "#/definitions/configInfo" }, + "type": { "$ref": "#/definitions/configType" }, + "component": { "$ref": "#/definitions/configComponent" }, + "classification": { "$ref": "#/definitions/configClassification" }, + "value": { "$ref": "#/definitions/configValue" }, + "conditions": { "$ref": "#/definitions/configConditions" }, + "action": { "$ref": "#/definitions/configAction" } + }, + "required": [] + } + }, + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "format_version": { + "type": "integer", + "default": 1 + }, + "rules": { + "type": "array", + "items": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "default": true, + "title": "Enabled", + "description": "" + }, + "type": { + "type": "string", + "oneOf": [ + { "const": "instance" , "title": "A specific Instance" }, + { "const": "template" , "title": "Each of the Instances" } + ], + "default": "template", + "title": "Apply this rule to:", + "description": "" + } + }, + "required": [ "type", "enabled" ], + 
"if": { + "properties": { + "type": { "const": "instance" } + } + }, + "then": { + "properties": { + "config": { "$ref": "#/definitions/configInstance" } + } + }, + "else": { + "properties": { + "config": { "$ref": "#/definitions/configTemplate" } + } + } + } + } + }, + "required": [ + "rules" + ] + }, + "uiSchema": { + "uiOptions": { + "fullPage": true + }, + "format_version": { + "ui:widget": "hidden" + }, + "name": { + "ui:widget": "hidden" + }, + "rules": { + "ui:openEmptyItem": true, + "items": { + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6", + "enabled": { + "ui:classNames": "dyncfg-grid-col-span-1-2", + "ui:widget": "checkbox" + }, + "type": { + "ui:classNames": "dyncfg-grid-col-span-5-2", + "ui:help": "Rules can be configured to match a specific instance (like a specific disk), or match all the instances (like all the disks). All rules are always checked against all nodes streamed to this Netdata, so the matching rules include patterns to match both instances and nodes.", + "ui:widget": "radio", + "ui:options": { + "flavour": "buttonGroup" + } + }, + "config": { + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "ui:flavour": "tabs", + "ui:options": { + "tabs": [ + { + "title": "Match", + "fields": [ + "match" + ] + }, + { + "title": "Value", + "fields": [ + "value" + ] + }, + { + "title": "Triggers", + "fields": [ + "conditions" + ] + }, + { + "title": "Action", + "fields": [ + "action" + ] + }, + { + "title": "Description", + "fields": [ + "summary", + "info", + "type", + "component", + "classification" + ] + } + ] + }, + "match": { + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "on": { + "ui:classNames": "dyncfg-grid-col-span-1-6" + }, + "host_labels": { + "ui:help": "A simple pattern to match the node labels of the nodes this rule is to be applied to. A space separated list of label=value pairs is accepted. Asterisks can be placed anywhere, including the label key. 
The label keys and their values are available at the labels filter of the charts on the dashboard.", + "ui:classNames": "dyncfg-grid-col-span-1-3" + }, + "instance_labels": { + "ui:classNames": "dyncfg-grid-col-span-4-3", + "ui:help": "A simple pattern to match the instance labels of the instances this rule is to be applied to. A space separated list of label=value pairs is accepted. Asterisks can be placed anywhere, including the label key. The label keys and their values are available at the labels filter of the charts on the dashboard." + } + }, + "summary": { + "ui:classNames": "dyncfg-grid-col-span-1-3" + }, + "info": { + "ui:classNames": "dyncfg-grid-col-span-4-3" + }, + "type": { + "ui:classNames": "dyncfg-grid-col-span-1-2" + }, + "component": { + "ui:classNames": "dyncfg-grid-col-span-3-2" + }, + "classification": { + "ui:classNames": "dyncfg-grid-col-span-5-2" + }, + "value": { + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "database_lookup": { + "ui:order": ["data_source", "time_group", "time_group_condition", "time_group_value", "after", "before", "dims_group", "dimensions", "options"], + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "ui:collapsible": true, + "ui:initiallyExpanded": true, + "data_source": { + "ui:widget": "radio", + "ui:options": { + "flavour": "buttonGroup" + }, + "ui:classNames": "dyncfg-grid-col-span-1-6" + }, + "time_group": { + "ui:help": "When querying time-series data we need to come up with a single value. This function is used to aggregate all the values of the time-series data to a single value.", + "ui:classNames": "dyncfg-grid-col-span-1-2" + }, + "time_group_condition": { + "ui:classNames": "dyncfg-grid-col-span-3-1" + }, + "time_group_value": { + "ui:classNames": "dyncfg-grid-col-span-4-1" + }, + "after": { + "ui:help": "The oldest timestamp of the time-series data to be included in the query. 
Negative values define a duration in seconds in the past of 'To' (so, -60 means a minute ago from 'To').", + "ui:classNames": "dyncfg-grid-col-span-1-1" + }, + "before": { + "ui:help": "The newest timestamp of the time-series data to be included in the query. Negative values define a duration in seconds in the past (so, -60 means a minute ago). Zero means now.", + "ui:classNames": "dyncfg-grid-col-span-2-1" + }, + "dims_group": { + "ui:help": "After each dimension has a single computed value, use this algorithm to derive the final value.", + "ui:classNames": "dyncfg-grid-col-span-3-2" + }, + "dimensions": { + "ui:help": "A simple pattern to match the dimensions that should be included in the query", + "ui:classNames": "dyncfg-grid-col-span-5-2" + }, + "options": { + "ui:classNames": "dyncfg-grid-col-span-1-6" + } + }, + "calculation": { + "ui:help": "The database value is available as '$this'. This expression can utilize variables to transform the value of the alert.", + "ui:classNames": "dyncfg-grid-col-span-1-4", + "ui:placeholder": "$this * 1" + }, + "units": { + "ui:help": "The unit of measurement the alert value is expressed with. 
If unset, the units of the instance the alert is attached to will be used.", + "ui:classNames": "dyncfg-grid-col-span-5-1" + }, + "update_every": { + "ui:help": "The frequency this alarm is to be evaluated, in seconds.", + "ui:classNames": "dyncfg-grid-col-span-6-1" + } + }, + "conditions": { + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "warning_condition": { + "ui:classNames": "dyncfg-grid-col-span-1-6" + }, + "critical_condition": { + "ui:classNames": "dyncfg-grid-col-span-1-6" + } + }, + "action": { + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "execute": { + "ui:classNames": "dyncfg-grid-col-span-1-3", + "ui:help": "Leave this empty to get the default alert notification" + }, + "recipient": { + "ui:classNames": "dyncfg-grid-col-span-4-1", + "ui:help": "A space separated list of the recipients of the alert notifications. The special recipient 'silent' prevents this alert from taking any action (i.e. sending notifications)." + }, + "options": { + "ui:classNames": "dyncfg-grid-col-span-5-2", + "ui:help": "Options related to the actions this alert will take." + }, + "delay": { + "ui:collapsible": true, + "ui:initiallyExpanded": false, + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "up": { + "ui:classNames": "dyncfg-grid-col-span-1-2", + "ui:help": "Delay the action (notification) that many seconds, when the alert is rising." + }, + "down": { + "ui:classNames": "dyncfg-grid-col-span-3-2", + "ui:help": "Delay the action (notification) that many seconds, when the alert is recovering." + }, + "multiplier": { + "ui:classNames": "dyncfg-grid-col-span-5-1", + "ui:help": "Multiply the delay by this number, every time the alert transitions to a new state, while the action (notification) is being delayed." + }, + "max": { + "ui:classNames": "dyncfg-grid-col-span-6-1", + "ui:help": "The maximum acceptable delay in seconds, for taking the action (notification)." 
+ } + }, + "repeat": { + "ui:collapsible": true, + "ui:initiallyExpanded": false, + "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "enabled": { + "ui:classNames": "dyncfg-grid-col-span-1-2" + }, + "warning": { + "ui:classNames": "dyncfg-grid-col-span-3-2", + "ui:help": "The number of seconds to repeat the action while the alert is in warning state" + }, + "critical": { + "ui:classNames": "dyncfg-grid-col-span-5-2", + "ui:help": "The number of seconds to repeat the action while the alert is in critical state" + } + } + }, + "hash": { + "ui:widget": "hidden" + }, + "source_type": { + "ui:widget": "hidden" + }, + "source": { + "ui:widget": "hidden" + } + } + } + } + } +} |