diff options
Diffstat (limited to 'collectors/proc.plugin/proc_mdstat.c')
-rw-r--r-- | collectors/proc.plugin/proc_mdstat.c | 1288 |
1 files changed, 647 insertions, 641 deletions
diff --git a/collectors/proc.plugin/proc_mdstat.c b/collectors/proc.plugin/proc_mdstat.c index abfd2ff12..e932453b4 100644 --- a/collectors/proc.plugin/proc_mdstat.c +++ b/collectors/proc.plugin/proc_mdstat.c @@ -1,641 +1,647 @@ -// SPDX-License-Identifier: GPL-3.0-or-later
-
-#include "plugin_proc.h"
-
-#define PLUGIN_PROC_MODULE_MDSTAT_NAME "/proc/mdstat"
-
-struct raid {
- int redundant;
- char *name;
- uint32_t hash;
-
- RRDDIM *rd_health;
- unsigned long long failed_disks;
-
- RRDSET *st_disks;
- RRDDIM *rd_down;
- RRDDIM *rd_inuse;
- unsigned long long total_disks;
- unsigned long long inuse_disks;
-
- RRDSET *st_operation;
- RRDDIM *rd_check;
- RRDDIM *rd_resync;
- RRDDIM *rd_recovery;
- RRDDIM *rd_reshape;
- unsigned long long check;
- unsigned long long resync;
- unsigned long long recovery;
- unsigned long long reshape;
-
- RRDSET *st_finish;
- RRDDIM *rd_finish_in;
- unsigned long long finish_in;
-
- RRDSET *st_speed;
- RRDDIM *rd_speed;
- unsigned long long speed;
-
- char *mismatch_cnt_filename;
- RRDSET *st_mismatch_cnt;
- RRDDIM *rd_mismatch_cnt;
- unsigned long long mismatch_cnt;
-
- RRDSET *st_nonredundant;
- RRDDIM *rd_nonredundant;
-};
-
-struct old_raid {
- int redundant;
- char *name;
- uint32_t hash;
- int found;
-};
-
-static inline char *remove_trailing_chars(char *s, char c) {
- while(*s) {
- if(unlikely(*s == c)) {
- *s = '\0';
- }
- s++;
- }
- return s;
-}
-
-static inline void make_chart_obsolete(char *name, const char *id_modifier) {
- char id[50 + 1];
- RRDSET *st = NULL;
-
- if(likely(name && id_modifier)) {
- snprintfz(id, 50, "mdstat.%s_%s", name, id_modifier);
- st = rrdset_find_byname_localhost(id);
- if(likely(st)) rrdset_is_obsolete(st);
- }
-}
-
-int do_proc_mdstat(int update_every, usec_t dt) {
- (void)dt;
- static procfile *ff = NULL;
- static int do_health = -1, do_nonredundant = -1, do_disks = -1, do_operations = -1, do_mismatch = -1, do_mismatch_config = -1;
- static int make_charts_obsolete = -1;
- static char *mdstat_filename = NULL, *mismatch_cnt_filename = NULL;
- static struct raid *raids = NULL;
- static size_t raids_allocated = 0;
- size_t raids_num = 0, raid_idx = 0, redundant_num = 0;
- static struct old_raid *old_raids = NULL;
- static size_t old_raids_allocated = 0;
- size_t old_raid_idx = 0;
-
- if(unlikely(do_health == -1)){
- do_health = config_get_boolean("plugin:proc:/proc/mdstat", "faulty devices", CONFIG_BOOLEAN_YES);
- do_nonredundant = config_get_boolean("plugin:proc:/proc/mdstat", "nonredundant arrays availability", CONFIG_BOOLEAN_YES);
- do_mismatch_config = config_get_boolean_ondemand("plugin:proc:/proc/mdstat", "mismatch count", CONFIG_BOOLEAN_AUTO);
- do_disks = config_get_boolean("plugin:proc:/proc/mdstat", "disk stats", CONFIG_BOOLEAN_YES);
- do_operations = config_get_boolean("plugin:proc:/proc/mdstat", "operation status", CONFIG_BOOLEAN_YES);
-
- make_charts_obsolete = config_get_boolean("plugin:proc:/proc/mdstat", "make charts obsolete", CONFIG_BOOLEAN_YES);
-
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/mdstat");
- mdstat_filename = config_get("plugin:proc:/proc/mdstat", "filename to monitor", filename);
-
- snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/block/%s/md/mismatch_cnt");
- mismatch_cnt_filename = config_get("plugin:proc:/proc/mdstat", "mismatch_cnt filename to monitor", filename);
- }
-
- if(unlikely(!ff)) {
- ff = procfile_open(mdstat_filename, " \t:", PROCFILE_FLAG_DEFAULT);
- if(unlikely(!ff)) return 1;
- }
-
- ff = procfile_readall(ff);
- if(unlikely(!ff)) return 0; // we return 0, so that we will retry opening it next time
-
- size_t lines = procfile_lines(ff);
- size_t words = 0;
-
- if(unlikely(lines < 2)) {
- error("Cannot read /proc/mdstat. Expected 2 or more lines, read %zu.", lines);
- return 1;
- }
-
- // find how many raids are there
- size_t l;
- raids_num = 0;
- for(l = 1; l < lines - 2 ; l++) {
- if(unlikely(procfile_lineword(ff, l, 1)[0] == 'a')) // check if the raid is active
- raids_num++;
- }
-
- if(unlikely(!raids_num && !old_raids_allocated)) return 0; // we return 0, so that we will retry searching for raids next time
-
- // allocate the memory we need;
- if(unlikely(raids_num != raids_allocated)) {
- for(raid_idx = 0; raid_idx < raids_allocated; raid_idx++) {
- struct raid *raid = &raids[raid_idx];
- freez(raid->name);
- freez(raid->mismatch_cnt_filename);
- }
- if(raids_num) {
- raids = (struct raid *)reallocz(raids, raids_num * sizeof(struct raid));
- memset(raids, 0, raids_num * sizeof(struct raid));
- }
- else {
- freez(raids);
- raids = NULL;
- }
- raids_allocated = raids_num;
- }
-
- // loop through all lines except the first and the last ones
- for(l = 1, raid_idx = 0; l < (lines - 2) && raid_idx < raids_num; l++) {
- struct raid *raid = &raids[raid_idx];
- raid->redundant = 0;
-
- words = procfile_linewords(ff, l);
- if(unlikely(words < 2)) continue;
-
- if(unlikely(procfile_lineword(ff, l, 1)[0] != 'a')) continue;
- if(unlikely(!raid->name)) {
- raid->name = strdupz(procfile_lineword(ff, l, 0));
- raid->hash = simple_hash(raid->name);
- }
- else if(unlikely(strcmp(raid->name, procfile_lineword(ff, l, 0)))) {
- freez(raid->name);
- freez(raid->mismatch_cnt_filename);
- memset(raid, 0, sizeof(struct raid));
- raid->name = strdupz(procfile_lineword(ff, l, 0));
- raid->hash = simple_hash(raid->name);
- }
- if(unlikely(!raid->name || !raid->name[0])) continue;
- raid_idx++;
-
- // check if raid has disk status
- l++;
- words = procfile_linewords(ff, l);
- if(words < 2 || procfile_lineword(ff, l, words - 1)[0] != '[') continue;
-
- // split inuse and total number of disks
- if(likely(do_health || do_disks)) {
- char *s = NULL, *str_total = NULL, *str_inuse = NULL;
-
- s = procfile_lineword(ff, l, words - 2);
- if(unlikely(s[0] != '[')) {
- error("Cannot read /proc/mdstat raid health status. Unexpected format: missing opening bracket.");
- continue;
- }
- str_total = ++s;
- while(*s) {
- if(unlikely(*s == '/')) {
- *s = '\0';
- str_inuse = s + 1;
- }
- else if(unlikely(*s == ']')) {
- *s = '\0';
- break;
- }
- s++;
- }
- if(unlikely(str_total[0] == '\0' || !str_inuse || str_inuse[0] == '\0')) {
- error("Cannot read /proc/mdstat raid health status. Unexpected format.");
- continue;
- }
-
- raid->inuse_disks = str2ull(str_inuse);
- raid->total_disks = str2ull(str_total);
- raid->failed_disks = raid->total_disks - raid->inuse_disks;
- }
-
- raid->redundant = 1;
- redundant_num++;
- l++;
-
- // check if any operation is performed on the raid
- if(likely(do_operations)) {
- char *s = NULL;
-
- raid->check = 0;
- raid->resync = 0;
- raid->recovery = 0;
- raid->reshape = 0;
- raid->finish_in = 0;
- raid->speed = 0;
-
- words = procfile_linewords(ff, l);
- if(likely(words < 2)) continue;
- if(unlikely(procfile_lineword(ff, l, 0)[0] != '[')) continue;
- if(unlikely(words < 7)) {
- error("Cannot read /proc/mdstat line. Expected 7 params, read %zu.", words);
- continue;
- }
-
- char *word;
- word = procfile_lineword(ff, l, 3);
- remove_trailing_chars(word, '%');
-
- unsigned long long percentage = (unsigned long long)(str2ld(word, NULL) * 100);
- // possible operations: check, resync, recovery, reshape
- // 4-th character is unique for each operation so it is checked
- switch(procfile_lineword(ff, l, 1)[3]) {
- case 'c': // check
- raid->check = percentage;
- break;
- case 'y': // resync
- raid->resync = percentage;
- break;
- case 'o': // recovery
- raid->recovery = percentage;
- break;
- case 'h': // reshape
- raid->reshape = percentage;
- break;
- }
-
- word = procfile_lineword(ff, l, 5);
- s = remove_trailing_chars(word, 'm'); // remove trailing "min"
-
- word += 7; // skip leading "finish="
-
- if(likely(s > word))
- raid->finish_in = (unsigned long long)(str2ld(word, NULL) * 60);
-
- word = procfile_lineword(ff, l, 6);
- s = remove_trailing_chars(word, 'K'); // remove trailing "K/sec"
-
- word += 6; // skip leading "speed="
-
- if(likely(s > word))
- raid->speed = str2ull(word);
- }
- }
-
- // read mismatch_cnt files
- if(do_mismatch == -1) {
- if(do_mismatch_config == CONFIG_BOOLEAN_AUTO) {
- if(raids_num > 50)
- do_mismatch = CONFIG_BOOLEAN_NO;
- else
- do_mismatch = CONFIG_BOOLEAN_YES;
- }
- else
- do_mismatch = do_mismatch_config;
- }
-
- if(likely(do_mismatch)) {
- for(raid_idx = 0; raid_idx < raids_num ; raid_idx++) {
- char filename[FILENAME_MAX + 1];
- struct raid *raid = &raids[raid_idx];
-
- if(likely(raid->redundant)) {
- if(unlikely(!raid->mismatch_cnt_filename)) {
- snprintfz(filename, FILENAME_MAX, mismatch_cnt_filename, raid->name);
- raid->mismatch_cnt_filename = strdupz(filename);
- }
- if(unlikely(read_single_number_file(raid->mismatch_cnt_filename, &raid->mismatch_cnt))) {
- error("Cannot read file '%s'", raid->mismatch_cnt_filename);
- do_mismatch = CONFIG_BOOLEAN_NO;
- error("Monitoring for mismatch count has been disabled");
- break;
- }
- }
- }
- }
-
- // check for disappeared raids
- for(old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) {
- struct old_raid *old_raid = &old_raids[old_raid_idx];
- int found = 0;
-
- for(raid_idx = 0; raid_idx < raids_num ; raid_idx++) {
- struct raid *raid = &raids[raid_idx];
-
- if(unlikely(raid->hash == old_raid->hash
- && !strcmp(raid->name, old_raid->name)
- && raid->redundant == old_raid->redundant)) found = 1;
- }
-
- old_raid->found = found;
- }
-
- int raid_disappeared = 0;
- for(old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) {
- struct old_raid *old_raid = &old_raids[old_raid_idx];
-
- if(unlikely(!old_raid->found)) {
- if(likely(make_charts_obsolete)) {
- make_chart_obsolete(old_raid->name, "disks");
- make_chart_obsolete(old_raid->name, "mismatch");
- make_chart_obsolete(old_raid->name, "operation");
- make_chart_obsolete(old_raid->name, "finish");
- make_chart_obsolete(old_raid->name, "speed");
- make_chart_obsolete(old_raid->name, "availability");
- }
- raid_disappeared = 1;
- }
- }
-
- // allocate memory for nonredundant arrays
- if(unlikely(raid_disappeared || old_raids_allocated != raids_num)) {
- for(old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) {
- freez(old_raids[old_raid_idx].name);
- }
- if(likely(raids_num)) {
- old_raids = reallocz(old_raids, sizeof(struct old_raid) * raids_num);
- memset(old_raids, 0, sizeof(struct old_raid) * raids_num);
- }
- else {
- freez(old_raids);
- old_raids = NULL;
- }
- old_raids_allocated = raids_num;
- for(old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) {
- struct old_raid *old_raid = &old_raids[old_raid_idx];
- struct raid *raid = &raids[old_raid_idx];
-
- old_raid->name = strdupz(raid->name);
- old_raid->hash = raid->hash;
- old_raid->redundant = raid->redundant;
- }
- }
-
- // --------------------------------------------------------------------
-
- if(likely(do_health && redundant_num)) {
- static RRDSET *st_mdstat_health = NULL;
- if(unlikely(!st_mdstat_health)) {
- st_mdstat_health = rrdset_create_localhost(
- "mdstat"
- , "mdstat_health"
- , NULL
- , "health"
- , "md.health"
- , "Faulty Devices In MD"
- , "failed disks"
- , PLUGIN_PROC_NAME
- , PLUGIN_PROC_MODULE_MDSTAT_NAME
- , NETDATA_CHART_PRIO_MDSTAT_HEALTH
- , update_every
- , RRDSET_TYPE_LINE
- );
-
- rrdset_isnot_obsolete(st_mdstat_health);
- }
- else
- rrdset_next(st_mdstat_health);
-
- if(!redundant_num) {
- if(likely(make_charts_obsolete)) make_chart_obsolete("mdstat", "health");
- }
- else {
- for(raid_idx = 0; raid_idx < raids_num; raid_idx++) {
- struct raid *raid = &raids[raid_idx];
-
- if(likely(raid->redundant)) {
- if(unlikely(!raid->rd_health && !(raid->rd_health = rrddim_find(st_mdstat_health, raid->name))))
- raid->rd_health = rrddim_add(st_mdstat_health, raid->name, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
-
- rrddim_set_by_pointer(st_mdstat_health, raid->rd_health, raid->failed_disks);
- }
- }
-
- rrdset_done(st_mdstat_health);
- }
- }
-
- // --------------------------------------------------------------------
-
- for(raid_idx = 0; raid_idx < raids_num ; raid_idx++) {
- struct raid *raid = &raids[raid_idx];
- char id[50 + 1];
- char family[50 + 1];
-
- if(likely(raid->redundant)) {
- if(likely(do_disks)) {
- snprintfz(id, 50, "%s_disks", raid->name);
-
- if(unlikely(!raid->st_disks && !(raid->st_disks = rrdset_find_byname_localhost(id)))) {
- snprintfz(family, 50, "%s", raid->name);
-
- raid->st_disks = rrdset_create_localhost(
- "mdstat"
- , id
- , NULL
- , family
- , "md.disks"
- , "Disks Stats"
- , "disks"
- , PLUGIN_PROC_NAME
- , PLUGIN_PROC_MODULE_MDSTAT_NAME
- , NETDATA_CHART_PRIO_MDSTAT_DISKS + raid_idx * 10
- , update_every
- , RRDSET_TYPE_STACKED
- );
-
- rrdset_isnot_obsolete(raid->st_disks);
- }
- else
- rrdset_next(raid->st_disks);
-
- if(unlikely(!raid->rd_inuse && !(raid->rd_inuse = rrddim_find(raid->st_disks, "inuse"))))
- raid->rd_inuse = rrddim_add(raid->st_disks, "inuse", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- if(unlikely(!raid->rd_down && !(raid->rd_down = rrddim_find(raid->st_disks, "down"))))
- raid->rd_down = rrddim_add(raid->st_disks, "down", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
-
- rrddim_set_by_pointer(raid->st_disks, raid->rd_inuse, raid->inuse_disks);
- rrddim_set_by_pointer(raid->st_disks, raid->rd_down, raid->failed_disks);
-
- rrdset_done(raid->st_disks);
- }
-
- // --------------------------------------------------------------------
-
- if(likely(do_mismatch)) {
- snprintfz(id, 50, "%s_mismatch", raid->name);
-
- if(unlikely(!raid->st_mismatch_cnt && !(raid->st_mismatch_cnt = rrdset_find_byname_localhost(id)))) {
- snprintfz(family, 50, "%s", raid->name);
-
- raid->st_mismatch_cnt = rrdset_create_localhost(
- "mdstat"
- , id
- , NULL
- , family
- , "md.mismatch_cnt"
- , "Mismatch Count"
- , "unsynchronized blocks"
- , PLUGIN_PROC_NAME
- , PLUGIN_PROC_MODULE_MDSTAT_NAME
- , NETDATA_CHART_PRIO_MDSTAT_MISMATCH + raid_idx * 10
- , update_every
- , RRDSET_TYPE_LINE
- );
-
- rrdset_isnot_obsolete(raid->st_mismatch_cnt);
- }
- else
- rrdset_next(raid->st_mismatch_cnt);
-
- if(unlikely(!raid->rd_mismatch_cnt && !(raid->rd_mismatch_cnt = rrddim_find(raid->st_mismatch_cnt, "count"))))
- raid->rd_mismatch_cnt = rrddim_add(raid->st_mismatch_cnt, "count", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
-
- rrddim_set_by_pointer(raid->st_mismatch_cnt, raid->rd_mismatch_cnt, raid->mismatch_cnt);
-
- rrdset_done(raid->st_mismatch_cnt);
- }
-
- // --------------------------------------------------------------------
-
- if(likely(do_operations)) {
- snprintfz(id, 50, "%s_operation", raid->name);
-
- if(unlikely(!raid->st_operation && !(raid->st_operation = rrdset_find_byname_localhost(id)))) {
- snprintfz(family, 50, "%s", raid->name);
-
- raid->st_operation = rrdset_create_localhost(
- "mdstat"
- , id
- , NULL
- , family
- , "md.status"
- , "Current Status"
- , "percent"
- , PLUGIN_PROC_NAME
- , PLUGIN_PROC_MODULE_MDSTAT_NAME
- , NETDATA_CHART_PRIO_MDSTAT_OPERATION + raid_idx * 10
- , update_every
- , RRDSET_TYPE_LINE
- );
-
- rrdset_isnot_obsolete(raid->st_operation);
- }
- else
- rrdset_next(raid->st_operation);
-
- if(unlikely(!raid->rd_check && !(raid->rd_check = rrddim_find(raid->st_operation, "check"))))
- raid->rd_check = rrddim_add(raid->st_operation, "check", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
- if(unlikely(!raid->rd_resync && !(raid->rd_resync = rrddim_find(raid->st_operation, "resync"))))
- raid->rd_resync = rrddim_add(raid->st_operation, "resync", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
- if(unlikely(!raid->rd_recovery && !(raid->rd_recovery = rrddim_find(raid->st_operation, "recovery"))))
- raid->rd_recovery = rrddim_add(raid->st_operation, "recovery", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
- if(unlikely(!raid->rd_reshape && !(raid->rd_reshape = rrddim_find(raid->st_operation, "reshape"))))
- raid->rd_reshape = rrddim_add(raid->st_operation, "reshape", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
-
- rrddim_set_by_pointer(raid->st_operation, raid->rd_check, raid->check);
- rrddim_set_by_pointer(raid->st_operation, raid->rd_resync, raid->resync);
- rrddim_set_by_pointer(raid->st_operation, raid->rd_recovery, raid->recovery);
- rrddim_set_by_pointer(raid->st_operation, raid->rd_reshape, raid->reshape);
-
- rrdset_done(raid->st_operation);
-
- // --------------------------------------------------------------------
-
- snprintfz(id, 50, "%s_finish", raid->name);
-
- if(unlikely(!raid->st_finish && !(raid->st_finish = rrdset_find_byname_localhost(id)))) {
- snprintfz(family, 50, "%s", raid->name);
-
- raid->st_finish = rrdset_create_localhost(
- "mdstat"
- , id
- , NULL
- , family
- , "md.rate"
- , "Approximate Time Unit Finish"
- , "seconds"
- , PLUGIN_PROC_NAME
- , PLUGIN_PROC_MODULE_MDSTAT_NAME
- , NETDATA_CHART_PRIO_MDSTAT_FINISH + raid_idx * 10
- , update_every
- , RRDSET_TYPE_LINE
- );
-
- rrdset_isnot_obsolete(raid->st_finish);
- }
- else
- rrdset_next(raid->st_finish);
-
- if(unlikely(!raid->rd_finish_in && !(raid->rd_finish_in = rrddim_find(raid->st_finish, "finish_in"))))
- raid->rd_finish_in = rrddim_add(raid->st_finish, "finish_in", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
-
- rrddim_set_by_pointer(raid->st_finish, raid->rd_finish_in, raid->finish_in);
-
- rrdset_done(raid->st_finish);
-
- // --------------------------------------------------------------------
-
- snprintfz(id, 50, "%s_speed", raid->name);
-
- if(unlikely(!raid->st_speed && !(raid->st_speed = rrdset_find_byname_localhost(id)))) {
- snprintfz(family, 50, "%s", raid->name);
-
- raid->st_speed = rrdset_create_localhost(
- "mdstat"
- , id
- , NULL
- , family
- , "md.rate"
- , "Operation Speed"
- , "KiB/s"
- , PLUGIN_PROC_NAME
- , PLUGIN_PROC_MODULE_MDSTAT_NAME
- , NETDATA_CHART_PRIO_MDSTAT_SPEED + raid_idx * 10
- , update_every
- , RRDSET_TYPE_LINE
- );
-
- rrdset_isnot_obsolete(raid->st_speed);
- }
- else
- rrdset_next(raid->st_speed);
-
- if(unlikely(!raid->rd_speed && !(raid->rd_speed = rrddim_find(raid->st_speed, "speed"))))
- raid->rd_speed = rrddim_add(raid->st_speed, "speed", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
-
- rrddim_set_by_pointer(raid->st_speed, raid->rd_speed, raid->speed);
-
- rrdset_done(raid->st_speed);
- }
- }
- else {
-
- // --------------------------------------------------------------------
-
- if(likely(do_nonredundant)) {
- snprintfz(id, 50, "%s_availability", raid->name);
-
- if(unlikely(!raid->st_nonredundant && !(raid->st_nonredundant = rrdset_find_localhost(id)))) {
- snprintfz(family, 50, "%s", raid->name);
-
- raid->st_nonredundant = rrdset_create_localhost(
- "mdstat"
- , id
- , NULL
- , family
- , "md.nonredundant"
- , "Nonredundant Array Availability"
- , "boolean"
- , PLUGIN_PROC_NAME
- , PLUGIN_PROC_MODULE_MDSTAT_NAME
- , NETDATA_CHART_PRIO_MDSTAT_NONREDUNDANT + raid_idx * 10
- , update_every
- , RRDSET_TYPE_LINE
- );
-
- rrdset_isnot_obsolete(raid->st_nonredundant);
- }
- else
- rrdset_next(raid->st_nonredundant);
-
- if(unlikely(!raid->rd_nonredundant && !(raid->rd_nonredundant = rrddim_find(raid->st_nonredundant, "available"))))
- raid->rd_nonredundant = rrddim_add(raid->st_nonredundant, "available", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
-
- rrddim_set_by_pointer(raid->st_nonredundant, raid->rd_nonredundant, 1);
-
- rrdset_done(raid->st_nonredundant);
- }
- }
- }
-
- return 0;
-}
+// SPDX-License-Identifier: GPL-3.0-or-later + +#include "plugin_proc.h" + +#define PLUGIN_PROC_MODULE_MDSTAT_NAME "/proc/mdstat" + +struct raid { + int redundant; + char *name; + uint32_t hash; + + RRDDIM *rd_health; + unsigned long long failed_disks; + + RRDSET *st_disks; + RRDDIM *rd_down; + RRDDIM *rd_inuse; + unsigned long long total_disks; + unsigned long long inuse_disks; + + RRDSET *st_operation; + RRDDIM *rd_check; + RRDDIM *rd_resync; + RRDDIM *rd_recovery; + RRDDIM *rd_reshape; + unsigned long long check; + unsigned long long resync; + unsigned long long recovery; + unsigned long long reshape; + + RRDSET *st_finish; + RRDDIM *rd_finish_in; + unsigned long long finish_in; + + RRDSET *st_speed; + RRDDIM *rd_speed; + unsigned long long speed; + + char *mismatch_cnt_filename; + RRDSET *st_mismatch_cnt; + RRDDIM *rd_mismatch_cnt; + unsigned long long mismatch_cnt; + + RRDSET *st_nonredundant; + RRDDIM *rd_nonredundant; +}; + +struct old_raid { + int redundant; + char *name; + uint32_t hash; + int found; +}; + +static inline char *remove_trailing_chars(char *s, char c) +{ + while (*s) { + if (unlikely(*s == c)) { + *s = '\0'; + } + s++; + } + return s; +} + +static inline void make_chart_obsolete(char *name, const char *id_modifier) +{ + char id[50 + 1]; + RRDSET *st = NULL; + + if (likely(name && id_modifier)) { + snprintfz(id, 50, "mdstat.%s_%s", name, id_modifier); + st = rrdset_find_active_byname_localhost(id); + if (likely(st)) + rrdset_is_obsolete(st); + } +} + +int do_proc_mdstat(int update_every, usec_t dt) +{ + (void)dt; + static procfile *ff = NULL; + static int do_health = -1, do_nonredundant = -1, do_disks = -1, do_operations = -1, do_mismatch = -1, + do_mismatch_config = -1; + static int make_charts_obsolete = -1; + static char *mdstat_filename = NULL, *mismatch_cnt_filename = NULL; + static struct raid *raids = NULL; + static size_t raids_allocated = 0; + size_t raids_num = 0, raid_idx = 0, redundant_num = 0; + static struct old_raid *old_raids = NULL; + static size_t old_raids_allocated = 0; + size_t old_raid_idx = 0; + + if (unlikely(do_health == -1)) { + do_health = + config_get_boolean("plugin:proc:/proc/mdstat", "faulty devices", CONFIG_BOOLEAN_YES); + do_nonredundant = + config_get_boolean("plugin:proc:/proc/mdstat", "nonredundant arrays availability", CONFIG_BOOLEAN_YES); + do_mismatch_config = + config_get_boolean_ondemand("plugin:proc:/proc/mdstat", "mismatch count", CONFIG_BOOLEAN_AUTO); + do_disks = + config_get_boolean("plugin:proc:/proc/mdstat", "disk stats", CONFIG_BOOLEAN_YES); + do_operations = + config_get_boolean("plugin:proc:/proc/mdstat", "operation status", CONFIG_BOOLEAN_YES); + + make_charts_obsolete = + config_get_boolean("plugin:proc:/proc/mdstat", "make charts obsolete", CONFIG_BOOLEAN_YES); + + char filename[FILENAME_MAX + 1]; + + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/mdstat"); + mdstat_filename = config_get("plugin:proc:/proc/mdstat", "filename to monitor", filename); + + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/block/%s/md/mismatch_cnt"); + mismatch_cnt_filename = config_get("plugin:proc:/proc/mdstat", "mismatch_cnt filename to monitor", filename); + } + + if (unlikely(!ff)) { + ff = procfile_open(mdstat_filename, " \t:", PROCFILE_FLAG_DEFAULT); + if (unlikely(!ff)) + return 1; + } + + ff = procfile_readall(ff); + if (unlikely(!ff)) + return 0; // we return 0, so that we will retry opening it next time + + size_t lines = procfile_lines(ff); + size_t words = 0; + + if (unlikely(lines < 2)) { + error("Cannot read /proc/mdstat. Expected 2 or more lines, read %zu.", lines); + return 1; + } + + // find how many raids are there + size_t l; + raids_num = 0; + for (l = 1; l < lines - 2; l++) { + if (unlikely(procfile_lineword(ff, l, 1)[0] == 'a')) // check if the raid is active + raids_num++; + } + + if (unlikely(!raids_num && !old_raids_allocated)) + return 0; // we return 0, so that we will retry searching for raids next time + + // allocate the memory we need; + if (unlikely(raids_num != raids_allocated)) { + for (raid_idx = 0; raid_idx < raids_allocated; raid_idx++) { + struct raid *raid = &raids[raid_idx]; + freez(raid->name); + freez(raid->mismatch_cnt_filename); + } + if (raids_num) { + raids = (struct raid *)reallocz(raids, raids_num * sizeof(struct raid)); + memset(raids, 0, raids_num * sizeof(struct raid)); + } else { + freez(raids); + raids = NULL; + } + raids_allocated = raids_num; + } + + // loop through all lines except the first and the last ones + for (l = 1, raid_idx = 0; l < (lines - 2) && raid_idx < raids_num; l++) { + struct raid *raid = &raids[raid_idx]; + raid->redundant = 0; + + words = procfile_linewords(ff, l); + + if (unlikely(words < 2)) + continue; + + if (unlikely(procfile_lineword(ff, l, 1)[0] != 'a')) + continue; + + if (unlikely(!raid->name)) { + raid->name = strdupz(procfile_lineword(ff, l, 0)); + raid->hash = simple_hash(raid->name); + } else if (unlikely(strcmp(raid->name, procfile_lineword(ff, l, 0)))) { + freez(raid->name); + freez(raid->mismatch_cnt_filename); + memset(raid, 0, sizeof(struct raid)); + raid->name = strdupz(procfile_lineword(ff, l, 0)); + raid->hash = simple_hash(raid->name); + } + + if (unlikely(!raid->name || !raid->name[0])) + continue; + + raid_idx++; + + // check if raid has disk status + l++; + words = procfile_linewords(ff, l); + if (words < 2 || procfile_lineword(ff, l, words - 1)[0] != '[') + continue; + + // split inuse and total number of disks + if (likely(do_health || do_disks)) { + char *s = NULL, *str_total = NULL, *str_inuse = NULL; + + s = procfile_lineword(ff, l, words - 2); + if (unlikely(s[0] != '[')) { + error("Cannot read /proc/mdstat raid health status. Unexpected format: missing opening bracket."); + continue; + } + str_total = ++s; + while (*s) { + if (unlikely(*s == '/')) { + *s = '\0'; + str_inuse = s + 1; + } else if (unlikely(*s == ']')) { + *s = '\0'; + break; + } + s++; + } + if (unlikely(str_total[0] == '\0' || !str_inuse || str_inuse[0] == '\0')) { + error("Cannot read /proc/mdstat raid health status. Unexpected format."); + continue; + } + + raid->inuse_disks = str2ull(str_inuse); + raid->total_disks = str2ull(str_total); + raid->failed_disks = raid->total_disks - raid->inuse_disks; + } + + raid->redundant = 1; + redundant_num++; + l++; + + // check if any operation is performed on the raid + if (likely(do_operations)) { + char *s = NULL; + + raid->check = 0; + raid->resync = 0; + raid->recovery = 0; + raid->reshape = 0; + raid->finish_in = 0; + raid->speed = 0; + + words = procfile_linewords(ff, l); + + if (likely(words < 2)) + continue; + + if (unlikely(procfile_lineword(ff, l, 0)[0] != '[')) + continue; + + if (unlikely(words < 7)) { + error("Cannot read /proc/mdstat line. Expected 7 params, read %zu.", words); + continue; + } + + char *word; + word = procfile_lineword(ff, l, 3); + remove_trailing_chars(word, '%'); + + unsigned long long percentage = (unsigned long long)(str2ld(word, NULL) * 100); + // possible operations: check, resync, recovery, reshape + // 4-th character is unique for each operation so it is checked + switch (procfile_lineword(ff, l, 1)[3]) { + case 'c': // check + raid->check = percentage; + break; + case 'y': // resync + raid->resync = percentage; + break; + case 'o': // recovery + raid->recovery = percentage; + break; + case 'h': // reshape + raid->reshape = percentage; + break; + } + + word = procfile_lineword(ff, l, 5); + s = remove_trailing_chars(word, 'm'); // remove trailing "min" + + word += 7; // skip leading "finish=" + + if (likely(s > word)) + raid->finish_in = (unsigned long long)(str2ld(word, NULL) * 60); + + word = procfile_lineword(ff, l, 6); + s = remove_trailing_chars(word, 'K'); // remove trailing "K/sec" + + word += 6; // skip leading "speed=" + + if (likely(s > word)) + raid->speed = str2ull(word); + } + } + + // read mismatch_cnt files + if (do_mismatch == -1) { + if (do_mismatch_config == CONFIG_BOOLEAN_AUTO) { + if (raids_num > 50) + do_mismatch = CONFIG_BOOLEAN_NO; + else + do_mismatch = CONFIG_BOOLEAN_YES; + } else + do_mismatch = do_mismatch_config; + } + + if (likely(do_mismatch)) { + for (raid_idx = 0; raid_idx < raids_num; raid_idx++) { + char filename[FILENAME_MAX + 1]; + struct raid *raid = &raids[raid_idx]; + + if (likely(raid->redundant)) { + if (unlikely(!raid->mismatch_cnt_filename)) { + snprintfz(filename, FILENAME_MAX, mismatch_cnt_filename, raid->name); + raid->mismatch_cnt_filename = strdupz(filename); + } + if (unlikely(read_single_number_file(raid->mismatch_cnt_filename, &raid->mismatch_cnt))) { + error("Cannot read file '%s'", raid->mismatch_cnt_filename); + do_mismatch = CONFIG_BOOLEAN_NO; + error("Monitoring for mismatch count has been disabled"); + break; + } + } + } + } + + // check for disappeared raids + for (old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) { + struct old_raid *old_raid = &old_raids[old_raid_idx]; + int found = 0; + + for (raid_idx = 0; raid_idx < raids_num; raid_idx++) { + struct raid *raid = &raids[raid_idx]; + + if (unlikely( + raid->hash == old_raid->hash && !strcmp(raid->name, old_raid->name) && + raid->redundant == old_raid->redundant)) + found = 1; + } + + old_raid->found = found; + } + + int raid_disappeared = 0; + for (old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) { + struct old_raid *old_raid = &old_raids[old_raid_idx]; + + if (unlikely(!old_raid->found)) { + if (likely(make_charts_obsolete)) { + make_chart_obsolete(old_raid->name, "disks"); + make_chart_obsolete(old_raid->name, "mismatch"); + make_chart_obsolete(old_raid->name, "operation"); + make_chart_obsolete(old_raid->name, "finish"); + make_chart_obsolete(old_raid->name, "speed"); + make_chart_obsolete(old_raid->name, "availability"); + } + raid_disappeared = 1; + } + } + + // allocate memory for nonredundant arrays + if (unlikely(raid_disappeared || old_raids_allocated != raids_num)) { + for (old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) { + freez(old_raids[old_raid_idx].name); + } + if (likely(raids_num)) { + old_raids = reallocz(old_raids, sizeof(struct old_raid) * raids_num); + memset(old_raids, 0, sizeof(struct old_raid) * raids_num); + } else { + freez(old_raids); + old_raids = NULL; + } + old_raids_allocated = raids_num; + for (old_raid_idx = 0; old_raid_idx < old_raids_allocated; old_raid_idx++) { + struct old_raid *old_raid = &old_raids[old_raid_idx]; + struct raid *raid = &raids[old_raid_idx]; + + old_raid->name = strdupz(raid->name); + old_raid->hash = raid->hash; + old_raid->redundant = raid->redundant; + } + } + + // -------------------------------------------------------------------- + + if (likely(do_health && redundant_num)) { + static RRDSET *st_mdstat_health = NULL; + if (unlikely(!st_mdstat_health)) { + st_mdstat_health = rrdset_create_localhost( + "mdstat", + "mdstat_health", + NULL, + "health", + "md.health", + "Faulty Devices In MD", + "failed disks", + PLUGIN_PROC_NAME, + PLUGIN_PROC_MODULE_MDSTAT_NAME, + NETDATA_CHART_PRIO_MDSTAT_HEALTH, + update_every, + RRDSET_TYPE_LINE); + + rrdset_isnot_obsolete(st_mdstat_health); + } else + rrdset_next(st_mdstat_health); + + if (!redundant_num) { + if (likely(make_charts_obsolete)) + make_chart_obsolete("mdstat", "health"); + } else { + for (raid_idx = 0; raid_idx < raids_num; raid_idx++) { + struct raid *raid = &raids[raid_idx]; + + if (likely(raid->redundant)) { + if (unlikely(!raid->rd_health && !(raid->rd_health = rrddim_find_active(st_mdstat_health, raid->name)))) + raid->rd_health = rrddim_add(st_mdstat_health, raid->name, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(st_mdstat_health, raid->rd_health, raid->failed_disks); + } + } + + rrdset_done(st_mdstat_health); + } + } + + // -------------------------------------------------------------------- + + for (raid_idx = 0; raid_idx < raids_num; raid_idx++) { + struct raid *raid = &raids[raid_idx]; + char id[50 + 1]; + char family[50 + 1]; + + if (likely(raid->redundant)) { + if (likely(do_disks)) { + snprintfz(id, 50, "%s_disks", raid->name); + + if (unlikely(!raid->st_disks && !(raid->st_disks = rrdset_find_active_byname_localhost(id)))) { + snprintfz(family, 50, "%s", raid->name); + + raid->st_disks = rrdset_create_localhost( + "mdstat", + id, + NULL, + family, + "md.disks", + "Disks Stats", + "disks", + PLUGIN_PROC_NAME, + PLUGIN_PROC_MODULE_MDSTAT_NAME, + NETDATA_CHART_PRIO_MDSTAT_DISKS + raid_idx * 10, + update_every, + RRDSET_TYPE_STACKED); + + rrdset_isnot_obsolete(raid->st_disks); + } else + rrdset_next(raid->st_disks); + + if (unlikely(!raid->rd_inuse && !(raid->rd_inuse = rrddim_find_active(raid->st_disks, "inuse")))) + raid->rd_inuse = rrddim_add(raid->st_disks, "inuse", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + if (unlikely(!raid->rd_down && !(raid->rd_down = rrddim_find_active(raid->st_disks, "down")))) + raid->rd_down = rrddim_add(raid->st_disks, "down", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(raid->st_disks, raid->rd_inuse, raid->inuse_disks); + rrddim_set_by_pointer(raid->st_disks, raid->rd_down, raid->failed_disks); + + rrdset_done(raid->st_disks); + } + + // -------------------------------------------------------------------- + + if (likely(do_mismatch)) { + snprintfz(id, 50, "%s_mismatch", raid->name); + + if (unlikely(!raid->st_mismatch_cnt && !(raid->st_mismatch_cnt = rrdset_find_active_byname_localhost(id)))) { + snprintfz(family, 50, "%s", raid->name); + + raid->st_mismatch_cnt = rrdset_create_localhost( + "mdstat", + id, + NULL, + family, + "md.mismatch_cnt", + "Mismatch Count", + "unsynchronized blocks", + PLUGIN_PROC_NAME, + PLUGIN_PROC_MODULE_MDSTAT_NAME, + NETDATA_CHART_PRIO_MDSTAT_MISMATCH + raid_idx * 10, + update_every, + RRDSET_TYPE_LINE); + + rrdset_isnot_obsolete(raid->st_mismatch_cnt); + } else + rrdset_next(raid->st_mismatch_cnt); + + if (unlikely(!raid->rd_mismatch_cnt && !(raid->rd_mismatch_cnt = rrddim_find_active(raid->st_mismatch_cnt, "count")))) + raid->rd_mismatch_cnt = rrddim_add(raid->st_mismatch_cnt, "count", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(raid->st_mismatch_cnt, raid->rd_mismatch_cnt, raid->mismatch_cnt); + + rrdset_done(raid->st_mismatch_cnt); + } + + // -------------------------------------------------------------------- + + if (likely(do_operations)) { + snprintfz(id, 50, "%s_operation", raid->name); + + if (unlikely(!raid->st_operation && !(raid->st_operation = rrdset_find_active_byname_localhost(id)))) { + snprintfz(family, 50, "%s", raid->name); + + raid->st_operation = rrdset_create_localhost( + "mdstat", + id, + NULL, + family, + "md.status", + "Current Status", + "percent", + PLUGIN_PROC_NAME, + PLUGIN_PROC_MODULE_MDSTAT_NAME, + NETDATA_CHART_PRIO_MDSTAT_OPERATION + raid_idx * 10, + update_every, + RRDSET_TYPE_LINE); + + rrdset_isnot_obsolete(raid->st_operation); + } else + rrdset_next(raid->st_operation); + + if(unlikely(!raid->rd_check && !(raid->rd_check = rrddim_find_active(raid->st_operation, "check")))) + raid->rd_check = rrddim_add(raid->st_operation, "check", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE); + if(unlikely(!raid->rd_resync && !(raid->rd_resync = rrddim_find_active(raid->st_operation, "resync")))) + raid->rd_resync = rrddim_add(raid->st_operation, "resync", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE); + if(unlikely(!raid->rd_recovery && !(raid->rd_recovery = rrddim_find_active(raid->st_operation, "recovery")))) + raid->rd_recovery = rrddim_add(raid->st_operation, "recovery", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE); + if(unlikely(!raid->rd_reshape && !(raid->rd_reshape = rrddim_find_active(raid->st_operation, "reshape")))) + raid->rd_reshape = rrddim_add(raid->st_operation, "reshape", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(raid->st_operation, raid->rd_check, raid->check); + rrddim_set_by_pointer(raid->st_operation, raid->rd_resync, raid->resync); + rrddim_set_by_pointer(raid->st_operation, raid->rd_recovery, raid->recovery); + rrddim_set_by_pointer(raid->st_operation, raid->rd_reshape, raid->reshape); + + rrdset_done(raid->st_operation); + + // -------------------------------------------------------------------- + + snprintfz(id, 50, "%s_finish", raid->name); + + if (unlikely(!raid->st_finish && !(raid->st_finish = rrdset_find_active_byname_localhost(id)))) { + snprintfz(family, 50, "%s", raid->name); + + raid->st_finish = rrdset_create_localhost( + "mdstat", + id, + NULL, + family, + "md.rate", + "Approximate Time Unit Finish", + "seconds", + PLUGIN_PROC_NAME, + PLUGIN_PROC_MODULE_MDSTAT_NAME, + NETDATA_CHART_PRIO_MDSTAT_FINISH + raid_idx * 10, + update_every, RRDSET_TYPE_LINE); + + rrdset_isnot_obsolete(raid->st_finish); + } else + rrdset_next(raid->st_finish); + + if(unlikely(!raid->rd_finish_in && !(raid->rd_finish_in = rrddim_find_active(raid->st_finish, "finish_in")))) + raid->rd_finish_in = rrddim_add(raid->st_finish, "finish_in", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(raid->st_finish, raid->rd_finish_in, raid->finish_in); + + rrdset_done(raid->st_finish); + + // -------------------------------------------------------------------- + + snprintfz(id, 50, "%s_speed", raid->name); + + if (unlikely(!raid->st_speed && !(raid->st_speed = rrdset_find_active_byname_localhost(id)))) { + snprintfz(family, 50, "%s", raid->name); + + raid->st_speed = rrdset_create_localhost( + "mdstat", + id, + NULL, + family, + "md.rate", + "Operation Speed", + "KiB/s", + PLUGIN_PROC_NAME, + PLUGIN_PROC_MODULE_MDSTAT_NAME, + NETDATA_CHART_PRIO_MDSTAT_SPEED + raid_idx * 10, + update_every, + RRDSET_TYPE_LINE); + + rrdset_isnot_obsolete(raid->st_speed); + } else + rrdset_next(raid->st_speed); + + if (unlikely(!raid->rd_speed && !(raid->rd_speed = rrddim_find_active(raid->st_speed, "speed")))) + raid->rd_speed = rrddim_add(raid->st_speed, "speed", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(raid->st_speed, raid->rd_speed, raid->speed); + + rrdset_done(raid->st_speed); + } + } else { + // -------------------------------------------------------------------- + + if (likely(do_nonredundant)) { + snprintfz(id, 50, "%s_availability", raid->name); + + if (unlikely(!raid->st_nonredundant && !(raid->st_nonredundant = rrdset_find_active_localhost(id)))) { + snprintfz(family, 50, "%s", raid->name); + + raid->st_nonredundant = rrdset_create_localhost( + "mdstat", + id, + NULL, + family, + "md.nonredundant", + "Nonredundant Array Availability", + "boolean", + PLUGIN_PROC_NAME, + PLUGIN_PROC_MODULE_MDSTAT_NAME, + NETDATA_CHART_PRIO_MDSTAT_NONREDUNDANT + raid_idx * 10, + update_every, + RRDSET_TYPE_LINE); + + rrdset_isnot_obsolete(raid->st_nonredundant); + } else + rrdset_next(raid->st_nonredundant); + + if (unlikely(!raid->rd_nonredundant && !(raid->rd_nonredundant = rrddim_find_active(raid->st_nonredundant, "available")))) + raid->rd_nonredundant = rrddim_add(raid->st_nonredundant, "available", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(raid->st_nonredundant, raid->rd_nonredundant, 1); + + rrdset_done(raid->st_nonredundant); + } + } + } + + return 0; +} |