diff options
Diffstat (limited to '')
-rw-r--r-- | web/api/health/README.md | 49 | ||||
-rw-r--r-- | web/api/health/health_cmdapi.c | 172 | ||||
-rw-r--r-- | web/api/health/health_cmdapi.h | 2 | ||||
-rw-r--r-- | web/api/netdata-swagger.json | 110 | ||||
-rw-r--r-- | web/api/netdata-swagger.yaml | 77 |
5 files changed, 338 insertions, 72 deletions
diff --git a/web/api/health/README.md b/web/api/health/README.md index 2003a61e0..66a80d5f6 100644 --- a/web/api/health/README.md +++ b/web/api/health/README.md @@ -45,6 +45,7 @@ The following will return an SVG badge of the alarm named `NAME`, attached to th ## Health Management API Netdata v1.12 and beyond provides a command API to control health checks and notifications at runtime. The feature is especially useful for maintenance periods, during which you receive meaningless alarms. +From Netdata v1.16.0 and beyond, the configuration controlled via the API commands is [persisted across netdata restarts](#persistence). Specifically, the API allows you to: - Disable health checks completely. Alarm conditions will not be evaluated at all and no entries will be added to the alarm log. @@ -142,6 +143,43 @@ Example 2.2: Add one more selector, to also silence alarms for cpu1 and cpu2 http://localhost/api/v1/manage/health?families=cpu1 cpu2 ``` +### List silencers + +The command `LIST` was added in netdata v1.16.0 and returns a JSON with the current status of the silencers. + +``` + curl "http://myserver/api/v1/manage/health?cmd=LIST" -H "X-Auth-Token: Mytoken" +``` + +As an example, the following response shows that we have two silencers configured, one for an alarm called `samplealarm` and one for alarms with context `random` on host `myhost` +``` +json +{ + "all": false, + "type": "SILENCE", + "silencers": [ + { + "alarm": "samplealarm" + }, + { + "context": "random", + "hosts": "myhost" + } + ] +} +``` + +The response below shows that we have disabled all health checks. + +``` +json +{ + "all": true, + "type": "DISABLE", + "silencers": [] +} + + ### Responses - "Auth Error" : Token authentication failed @@ -155,6 +193,17 @@ http://localhost/api/v1/manage/health?families=cpu1 cpu2 - "WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command." : Added to the response if a selector is added without a selector-specific command. - "WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors." : Added to the response if a selector-specific command is issued without a selector. +### Persistence + +From netdata v1.16.0 and beyond, the silencers configuration is persisted to disk and loaded when netdata starts. +The JSON string returned by the [LIST command](#list-silencers) is automatically saved to the `silencers file`, every time a command alters the silencers configuration. +The file's location is configurable in `netdata.conf`. The default is shown below: + +``` +[health] + # silencers file = /var/lib/netdata/health.silencers.json +``` + ### Further reading The test script under [tests/health_mgmtapi](../../../tests/health_mgmtapi) contains a series of tests that you can either run or read through to understand the various calls and responses better. diff --git a/web/api/health/health_cmdapi.c b/web/api/health/health_cmdapi.c index ec177751b..468054c67 100644 --- a/web/api/health/health_cmdapi.c +++ b/web/api/health/health_cmdapi.c @@ -1,17 +1,16 @@ // -// Created by christopher on 11/12/18. +// Created by Christopher on 11/12/18. // #include "health_cmdapi.h" - -static SILENCER *create_silencer(void) { - SILENCER *t = callocz(1, sizeof(SILENCER)); - debug(D_HEALTH, "HEALTH command API: Created empty silencer"); - - return t; -} - +/** + * Free Silencers + * + * Clean the silencer structure + * + * @param t is the structure that will be cleaned. + */ void free_silencers(SILENCER *t) { if (!t) return; if (t->next) free_silencers(t->next); @@ -31,38 +30,104 @@ void free_silencers(SILENCER *t) { return; } +/** + * Silencers to JSON Entry + * + * Fill the buffer with the other values given. + * + * @param wb a pointer to the output buffer + * @param var the json variable + * @param val the json value + * @param hasprev has it a previous value? + * + * @return + */ +int health_silencers2json_entry(BUFFER *wb, char* var, char* val, int hasprev) { + if (val) { + buffer_sprintf(wb, "%s\n\t\t\t\"%s\": \"%s\"", (hasprev)?",":"", var, val); + return 1; + } else { + return hasprev; + } +} +/** + * Silencer to JSON + * + * Write the silencer values using JSON format inside a buffer. + * + * @param wb is the buffer to write the silencers. + */ +void health_silencers2json(BUFFER *wb) { + buffer_sprintf(wb, "{\n\t\"all\": %s," + "\n\t\"type\": \"%s\"," + "\n\t\"silencers\": [", + (silencers->all_alarms)?"true":"false", + (silencers->stype == STYPE_NONE)?"None":((silencers->stype == STYPE_DISABLE_ALARMS)?"DISABLE":"SILENCE")); + + SILENCER *silencer; + int i = 0, j = 0; + for(silencer = silencers->silencers; silencer ; silencer = silencer->next) { + if(likely(i)) buffer_strcat(wb, ","); + buffer_strcat(wb, "\n\t\t{"); + j=health_silencers2json_entry(wb, HEALTH_ALARM_KEY, silencer->alarms, j); + j=health_silencers2json_entry(wb, HEALTH_CHART_KEY, silencer->charts, j); + j=health_silencers2json_entry(wb, HEALTH_CONTEXT_KEY, silencer->contexts, j); + j=health_silencers2json_entry(wb, HEALTH_HOST_KEY, silencer->hosts, j); + health_silencers2json_entry(wb, HEALTH_FAMILIES_KEY, silencer->families, j); + j=0; + buffer_strcat(wb, "\n\t\t}"); + i++; + } + if(likely(i)) buffer_strcat(wb, "\n\t"); + buffer_strcat(wb, "]\n}\n"); +} +/** + * Silencer to FILE + * + * Write the sliencer buffer to a file. + * @param wb + */ +void health_silencers2file(BUFFER *wb) { + if (wb->len == 0) return; + + FILE *fd = fopen(silencers_filename, "wb"); + if(fd) { + size_t written = (size_t)fprintf(fd, "%s", wb->buffer) ; + if (written == wb->len ) { + info("Silencer changes written to %s", silencers_filename); + } + fclose(fd); + return; + } + error("Silencer changes could not be written to %s. Error %s", silencers_filename, strerror(errno)); +} + +/** + * Request V1 MGMT Health + * + * Function called by api to management the health. + * + * @param host main structure with client information! + * @param w is the structure with all information of the client request. + * @param url is the url that netdata is working + * + * @return It returns 200 on success and another code otherwise. + */ int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, char *url) { int ret = 400; (void) host; - - BUFFER *wb = w->response.data; buffer_flush(wb); wb->contenttype = CT_TEXT_PLAIN; buffer_flush(w->response.data); - static uint32_t - hash_alarm = 0, - hash_template = 0, - hash_chart = 0, - hash_context = 0, - hash_host = 0, - hash_families = 0; - - if (unlikely(!hash_alarm)) { - hash_alarm = simple_uhash(HEALTH_ALARM_KEY); - hash_template = simple_uhash(HEALTH_TEMPLATE_KEY); - hash_chart = simple_uhash(HEALTH_CHART_KEY); - hash_context = simple_uhash(HEALTH_CONTEXT_KEY); - hash_host = simple_uhash(HEALTH_HOST_KEY); - hash_families = simple_uhash(HEALTH_FAMILIES_KEY); - } - + //Local instance of the silencer SILENCER *silencer = NULL; + int config_changed = 1; if (!w->auth_bearer_token) { buffer_strcat(wb, HEALTH_CMDAPI_MSG_AUTHERROR); @@ -105,50 +170,17 @@ int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, c free_silencers(silencers->silencers); silencers->silencers = NULL; buffer_strcat(wb, HEALTH_CMDAPI_MSG_RESET); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_LIST)) { + w->response.data->contenttype = CT_APPLICATION_JSON; + health_silencers2json(wb); + config_changed=0; } } else { - uint32_t hash = simple_uhash(key); - if (unlikely(silencer == NULL)) { - if ( - (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) || - (hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) || - (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) || - (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) || - (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) || - (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) - ) { - silencer = create_silencer(); - } - } - - if (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { - silencer->alarms = strdupz(value); - silencer->alarms_pattern = simple_pattern_create(silencer->alarms, NULL, SIMPLE_PATTERN_EXACT); - } else if (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) { - silencer->charts = strdupz(value); - silencer->charts_pattern = simple_pattern_create(silencer->charts, NULL, SIMPLE_PATTERN_EXACT); - } else if (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) { - silencer->contexts = strdupz(value); - silencer->contexts_pattern = simple_pattern_create(silencer->contexts, NULL, SIMPLE_PATTERN_EXACT); - } else if (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { - silencer->hosts = strdupz(value); - silencer->hosts_pattern = simple_pattern_create(silencer->hosts, NULL, SIMPLE_PATTERN_EXACT); - } else if (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) { - silencer->families = strdupz(value); - silencer->families_pattern = simple_pattern_create(silencer->families, NULL, SIMPLE_PATTERN_EXACT); - } else { - buffer_strcat(wb, HEALTH_CMDAPI_MSG_INVALID_KEY); - } + silencer = health_silencers_addparam(silencer, key, value); } - } if (likely(silencer)) { - // Add the created instance to the linked list in silencers - silencer->next = silencers->silencers; - silencers->silencers = silencer; - debug(D_HEALTH, "HEALTH command API: Added silencer %s:%s:%s:%s:%s", silencer->alarms, - silencer->charts, silencer->contexts, silencer->hosts, silencer->families - ); + health_silencers_add(silencer); buffer_strcat(wb, HEALTH_CMDAPI_MSG_ADDED); if (silencers->stype == STYPE_NONE) { buffer_strcat(wb, HEALTH_CMDAPI_MSG_STYPEWARNING); @@ -162,5 +194,11 @@ int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, c } w->response.data = wb; buffer_no_cacheable(w->response.data); + if (ret == 200 && config_changed) { + BUFFER *jsonb = buffer_create(200); + health_silencers2json(jsonb); + health_silencers2file(jsonb); + } + return ret; } diff --git a/web/api/health/health_cmdapi.h b/web/api/health/health_cmdapi.h index d0f30401c..d8ec6aaa0 100644 --- a/web/api/health/health_cmdapi.h +++ b/web/api/health/health_cmdapi.h @@ -12,6 +12,7 @@ #define HEALTH_CMDAPI_CMD_SILENCE "SILENCE" #define HEALTH_CMDAPI_CMD_DISABLE "DISABLE" #define HEALTH_CMDAPI_CMD_RESET "RESET" +#define HEALTH_CMDAPI_CMD_LIST "LIST" #define HEALTH_CMDAPI_MSG_AUTHERROR "Auth Error\n" #define HEALTH_CMDAPI_MSG_SILENCEALL "All alarm notifications are silenced\n" @@ -20,7 +21,6 @@ #define HEALTH_CMDAPI_MSG_DISABLE "Health checks disabled for alarms matching the selectors\n" #define HEALTH_CMDAPI_MSG_SILENCE "Alarm notifications silenced for alarms matching the selectors\n" #define HEALTH_CMDAPI_MSG_ADDED "Alarm selector added\n" -#define HEALTH_CMDAPI_MSG_INVALID_KEY "Invalid key. Ignoring it.\n" #define HEALTH_CMDAPI_MSG_STYPEWARNING "WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command.\n" #define HEALTH_CMDAPI_MSG_NOSELECTORWARNING "WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors.\n" diff --git a/web/api/netdata-swagger.json b/web/api/netdata-swagger.json index 2fa55c4fa..63bc5638d 100644 --- a/web/api/netdata-swagger.json +++ b/web/api/netdata-swagger.json @@ -77,6 +77,39 @@ } } }, + "/alarm_variables": { + "get": { + "summary": "List variables available to configure alarms for a chart", + "description": "Returns the basic information of a chart and all the variables that can be used in alarm and template health configurations for the particular chart or family", + "parameters": [ + { + "name": "chart", + "in": "query", + "description": "The id of the chart as returned by the /charts call.", + "required": true, + "type": "string", + "format": "as returned by /charts" + } + ], + "responses": { + "200": { + "description": "A javascript object with information about the chart and the available variables", + "schema": { + "$ref": "#/definitions/alarm_variables" + } + }, + "400": { + "description": "Bad request - the body will include a message stating what is wrong." + }, + "404": { + "description": "No chart with the given id is found." + }, + "500": { + "description": "Internal server error. This usually means the server is out of memory." + } + } + } + }, "/data": { "get": { "summary": "Get collected data for a specific chart", @@ -631,7 +664,7 @@ { "name": "cmd", "in": "query", - "description": "DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors.", + "description": "DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors. LIST: Show active configuration.", "required": false, "type": "string", "enum": [ @@ -639,7 +672,8 @@ "SILENCE ALL", "DISABLE", "SILENCE", - "RESET" + "RESET", + "LIST" ] }, { @@ -951,6 +985,70 @@ } } }, + "alarm_variables": { + "type": "object", + "properties": { + "chart": { + "type": "string", + "description": "The unique id of the chart" + }, + "chart_name": { + "type": "string", + "description": "The name of the chart" + }, + "cnart_context": { + "type": "string", + "description": "The context of the chart. It is shared across multiple monitored software or hardware instances and used in alarm templates" + }, + "family": { + "type": "string", + "description": "The family of the chart." + }, + "host": { + "type": "string", + "description": "The host containing the chart." + }, + "chart_variables": { + "type": "object", + "properties": { + "varname1": { + "type": "number", + "format": "float" + }, + "varname2": { + "type": "number", + "format": "float" + } + } + }, + "family_variables": { + "type": "object", + "properties": { + "varname1": { + "type": "number", + "format": "float" + }, + "varname2": { + "type": "number", + "format": "float" + } + } + }, + "host_variables": { + "type": "object", + "properties": { + "varname1": { + "type": "number", + "format": "float" + }, + "varname2": { + "type": "number", + "format": "float" + } + } + } + } + }, "dimension": { "type": "object", "properties": { @@ -1208,6 +1306,14 @@ "crit_parsed": { "type": "string" }, + "warn_repeat_every": { + "type": "integer", + "format": "int32" + }, + "crit_repeat_every": { + "type": "integer", + "format": "int32" + }, "green": { "type": "string", "format": "nullable" diff --git a/web/api/netdata-swagger.yaml b/web/api/netdata-swagger.yaml index c021efefa..3386e01a7 100644 --- a/web/api/netdata-swagger.yaml +++ b/web/api/netdata-swagger.yaml @@ -63,6 +63,28 @@ paths: $ref: '#/definitions/chart' '404': description: 'No chart with the given id is found.' + /alarm_variables: + get: + summary: 'List variables available to configure alarms for a chart' + description: 'Returns the basic information of a chart and all the variables that can be used in alarm and template health configurations for the particular chart or family' + parameters: + - name: chart + in: query + description: 'The id of the chart as returned by the /charts call.' + required: true + type: string + format: 'as returned by /charts' + responses: + '200': + description: 'A javascript object with information about the chart and the available variables' + schema: + $ref: '#/definitions/alarm_variables' + '400': + description: 'Bad request - the body will include a message stating what is wrong.' + '404': + description: 'No chart with the given id is found.' + '500': + description: 'Internal server error. This usually means the server is out of memory.' /data: get: summary: 'Get collected data for a specific chart' @@ -415,10 +437,10 @@ paths: parameters: - name: cmd in: query - description: 'DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors.' + description: 'DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors. LIST: Show active configuration.' required: false type: string - enum: ['DISABLE ALL', 'SILENCE ALL', 'DISABLE', 'SILENCE', 'RESET'] + enum: ['DISABLE ALL', 'SILENCE ALL', 'DISABLE', 'SILENCE', 'RESET', 'LIST'] - name: alarm in: query description: 'The expression provided will match both `alarm` and `template` names.' @@ -638,6 +660,51 @@ definitions: red: type: number description: 'Chart health red trheshold' + alarm_variables: + type: object + properties: + chart: + type: string + description: 'The unique id of the chart' + chart_name: + type: string + description: 'The name of the chart' + cnart_context: + type: string + description: 'The context of the chart. It is shared across multiple monitored software or hardware instances and used in alarm templates' + family: + type: string + description: 'The family of the chart.' + host: + type: string + description: 'The host containing the chart.' + chart_variables: + type: object + properties: + varname1: + type: number + format: float + varname2: + type: number + format: float + family_variables: + type: object + properties: + varname1: + type: number + format: float + varname2: + type: number + format: float + host_variables: + type: object + properties: + varname1: + type: number + format: float + varname2: + type: number + format: float dimension: type: object properties: @@ -825,6 +892,12 @@ definitions: type: string crit_parsed: type: string + warn_repeat_every: + type: integer + format: int32 + crit_repeat_every: + type: integer + format: int32 green: type: string format: nullable |