From aa2fe8ccbfcb117efa207d10229eeeac5d0f97c7 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 6 Feb 2023 17:11:30 +0100 Subject: Adding upstream version 1.38.0. Signed-off-by: Daniel Baumann --- web/README.md | 18 +- web/api/README.md | 6 +- web/api/badges/README.md | 2 +- web/api/badges/web_buffer_svg.c | 8 +- web/api/exporters/prometheus/README.md | 2 +- web/api/exporters/shell/allmetrics_shell.c | 2 +- web/api/formatters/README.md | 24 +- web/api/formatters/charts2json.c | 10 +- web/api/formatters/csv/README.md | 2 +- web/api/formatters/json/README.md | 2 +- web/api/formatters/json_wrapper.c | 60 ++- web/api/formatters/rrd2json.c | 4 +- web/api/formatters/rrd2json.h | 1 + web/api/formatters/rrdset2json.c | 6 +- web/api/formatters/ssv/README.md | 2 +- web/api/formatters/value/README.md | 6 +- web/api/formatters/value/value.c | 3 +- web/api/formatters/value/value.h | 2 +- web/api/health/README.md | 6 +- web/api/health/health_cmdapi.c | 2 +- web/api/queries/README.md | 2 +- web/api/queries/query.c | 712 +++++++++++++++++++---------- web/api/queries/rrdr.h | 6 +- web/api/queries/weights.c | 18 +- web/api/web_api_v1.c | 53 +-- web/api/web_api_v1.h | 2 - web/gui/README.md | 22 +- web/gui/confluence/README.md | 2 +- web/gui/custom/README.md | 17 +- web/gui/dashboard_info.js | 394 +++++++++++++++- web/gui/main.js | 3 +- web/server/README.md | 20 +- web/server/static/static-threaded.c | 69 +-- web/server/web_client.c | 68 ++- web/server/web_client.h | 7 +- web/server/web_client_cache.c | 19 +- web/server/web_server.c | 2 +- 37 files changed, 1127 insertions(+), 457 deletions(-) (limited to 'web') diff --git a/web/README.md b/web/README.md index 7093ca18f..eae579346 100644 --- a/web/README.md +++ b/web/README.md @@ -14,17 +14,17 @@ team and the community, but you can also customize them yourself. There are two primary ways to view Netdata's dashboards: -1. The [local Agent dashboard](/web/gui/README.md) that comes pre-configured with every Netdata installation. 
You can +1. The [local Agent dashboard](https://github.com/netdata/netdata/blob/master/web/gui/README.md) that comes pre-configured with every Netdata installation. You can see it at `http://NODE:19999`, replacing `NODE` with `localhost`, the hostname of your node, or its IP address. You can customize the contents and colors of the standard dashboard [using - JavaScript](/web/gui/README.md#customizing-the-local-dashboard). + JavaScript](https://github.com/netdata/netdata/blob/master/web/gui/README.md#customizing-the-local-dashboard). 2. The [`dashboard.js` JavaScript library](#dashboardjs), which helps you - [customize the standard dashboards](/web/gui/README.md#customizing-the-local-dashboard) - using JavaScript, or create entirely new [custom dashboards](/web/gui/custom/README.md) or - [Atlassian Confluence dashboards](/web/gui/confluence/README.md). + [customize the standard dashboards](https://github.com/netdata/netdata/blob/master/web/gui/README.md#customizing-the-local-dashboard) + using JavaScript, or create entirely new [custom dashboards](https://github.com/netdata/netdata/blob/master/web/gui/custom/README.md) or + [Atlassian Confluence dashboards](https://github.com/netdata/netdata/blob/master/web/gui/confluence/README.md). -You can also view all the data Netdata collects through the [REST API v1](/web/api/README.md#netdata-rest-api). +You can also view all the data Netdata collects through the [REST API v1](https://github.com/netdata/netdata/blob/master/web/api/README.md#netdata-rest-api). No matter where you use Netdata's charts, you'll want to know how to [use](#using-charts) them. You'll also want to understand how Netdata defines [charts](#charts), [dimensions](#dimensions), [families](#families), and @@ -84,7 +84,7 @@ Netdata organizes metrics into charts, dimensions, families, and contexts. A **chart** is an individual, interactive, always-updating graphic displaying one or more collected/calculated metrics. 
Charts are generated by -[collectors](/collectors/README.md). +[collectors](https://github.com/netdata/netdata/blob/master/collectors/README.md). Here's the system CPU chart, the first chart displayed on the standard dashboard: @@ -182,7 +182,7 @@ hover over the date above the list of dimensions. A tooltip will appear that shows you two pieces of information: the collector that produces the chart, and the chart's context. -Netdata also uses [contexts for alarm templates](/health/REFERENCE.md#alarm-line-on). You can create an alarm for the +Netdata also uses [contexts for alarm templates](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alarm-line-on). You can create an alarm for the `net.packets` context to receive alerts for any chart with that context, no matter which family it's attached to. ## Positive and negative values on charts @@ -215,7 +215,7 @@ all the charts and other visualizations that appear on any Netdata dashboard. You need to put `dashboard.js` on any HTML page that's going to render Netdata charts. -The [custom dashboards documentation](/web/gui/custom/README.md) contains examples of such +The [custom dashboards documentation](https://github.com/netdata/netdata/blob/master/web/gui/custom/README.md) contains examples of such custom HTML pages. ### Generating dashboard.js diff --git a/web/api/README.md b/web/api/README.md index fc520a09a..82a55eb25 100644 --- a/web/api/README.md +++ b/web/api/README.md @@ -1,6 +1,10 @@ # API diff --git a/web/api/badges/README.md b/web/api/badges/README.md index 84409471a..8f6eca62a 100644 --- a/web/api/badges/README.md +++ b/web/api/badges/README.md @@ -25,7 +25,7 @@ Similarly, there is [a chart that shows outbound bandwidth per class](http://lon The right one is a **volume** calculation. Netdata calculated the total of the last 86.400 seconds (a day) which gives `kilobits`, then divided it by 8 to make it KB, then by 1024 to make it MB and then by 1024 to make it GB. 
Calculations like this are quite accurate, since for every value collected, every second, Netdata interpolates it to second boundary using microsecond calculations. -Let's see a few more badge examples (they come from the [Netdata registry](/registry/README.md)): +Let's see a few more badge examples (they come from the [Netdata registry](https://github.com/netdata/netdata/blob/master/registry/README.md)): - **cpu usage of user `root`** (you can pick any user; 100% = 1 core). This will be `green <10%`, `yellow <20%`, `orange <50%`, `blue <100%` (1 core), `red` otherwise (you define thresholds and colors on the URL). diff --git a/web/api/badges/web_buffer_svg.c b/web/api/badges/web_buffer_svg.c index 080f2240f..ca0f4b7a0 100644 --- a/web/api/badges/web_buffer_svg.c +++ b/web/api/badges/web_buffer_svg.c @@ -913,7 +913,7 @@ int web_client_api_request_v1_badge(RRDHOST *host, struct web_client *w, char *u if(!strcmp(name, "chart")) chart = value; else if(!strcmp(name, "dimension") || !strcmp(name, "dim") || !strcmp(name, "dimensions") || !strcmp(name, "dims")) { if(!dimensions) - dimensions = buffer_create(100); + dimensions = buffer_create(100, &netdata_buffers_statistics.buffers_api); buffer_strcat(dimensions, "|"); buffer_strcat(dimensions, value); @@ -969,7 +969,7 @@ int web_client_api_request_v1_badge(RRDHOST *host, struct web_client *w, char *u ret = HTTP_RESP_OK; goto cleanup; } - st->last_accessed_time = now_realtime_sec(); + st->last_accessed_time_s = now_realtime_sec(); if(alarm) { rca = rrdcalc_from_rrdset_get(st, alarm); @@ -1110,14 +1110,14 @@ int web_client_api_request_v1_badge(RRDHOST *host, struct web_client *w, char *u ret = HTTP_RESP_INTERNAL_SERVER_ERROR; // if the collected value is too old, don't calculate its value - if (rrdset_last_entry_t(st) >= (now_realtime_sec() - (st->update_every * st->gap_when_lost_iterations_above))) + if (rrdset_last_entry_s(st) >= (now_realtime_sec() - (st->update_every * gap_when_lost_iterations_above))) ret = 
rrdset2value_api_v1(st, w->response.data, &n, (dimensions) ? buffer_tostring(dimensions) : NULL, points, after, before, group, group_options, 0, options, NULL, &latest_timestamp, NULL, NULL, NULL, &value_is_null, NULL, 0, 0, - QUERY_SOURCE_API_BADGE); + QUERY_SOURCE_API_BADGE, STORAGE_PRIORITY_NORMAL); // if the value cannot be calculated, show empty badge if (ret != HTTP_RESP_OK) { diff --git a/web/api/exporters/prometheus/README.md b/web/api/exporters/prometheus/README.md index cf7e2caa8..1ff86f4e0 100644 --- a/web/api/exporters/prometheus/README.md +++ b/web/api/exporters/prometheus/README.md @@ -5,6 +5,6 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/web/api/exporter # Prometheus exporter -Read the Prometheus exporter documentation: [Using Netdata with Prometheus](/exporting/prometheus/README.md). +Read the Prometheus exporter documentation: [Using Netdata with Prometheus](https://github.com/netdata/netdata/blob/master/exporting/prometheus/README.md). diff --git a/web/api/exporters/shell/allmetrics_shell.c b/web/api/exporters/shell/allmetrics_shell.c index 0ffbac67b..dded5a536 100644 --- a/web/api/exporters/shell/allmetrics_shell.c +++ b/web/api/exporters/shell/allmetrics_shell.c @@ -127,7 +127,7 @@ void rrd_stats_api_v1_charts_allmetrics_json(RRDHOST *host, const char *filter_s rrdset_family(st), rrdset_context(st), rrdset_units(st), - (int64_t)rrdset_last_entry_t(st)); + (int64_t) rrdset_last_entry_s(st)); chart_counter++; dimension_counter = 0; diff --git a/web/api/formatters/README.md b/web/api/formatters/README.md index 3e67ac6ee..4c281f064 100644 --- a/web/api/formatters/README.md +++ b/web/api/formatters/README.md @@ -12,18 +12,18 @@ The following formats are supported: | format|module|content type|description| |:----:|:----:|:----------:|:----------| -| `array`|[ssv](/web/api/formatters/ssv/README.md)|application/json|a JSON array| -| `csv`|[csv](/web/api/formatters/csv/README.md)|text/plain|a text table, comma separated, with a 
header line (dimension names) and `\r\n` at the end of the lines| -| `csvjsonarray`|[csv](/web/api/formatters/csv/README.md)|application/json|a JSON array, with each row as another array (the first row has the dimension names)| -| `datasource`|[json](/web/api/formatters/json/README.md)|application/json|a Google Visualization Provider `datasource` javascript callback| -| `datatable`|[json](/web/api/formatters/json/README.md)|application/json|a Google `datatable`| -| `html`|[csv](/web/api/formatters/csv/README.md)|text/html|an html table| -| `json`|[json](/web/api/formatters/json/README.md)|application/json|a JSON object| -| `jsonp`|[json](/web/api/formatters/json/README.md)|application/json|a JSONP javascript callback| -| `markdown`|[csv](/web/api/formatters/csv/README.md)|text/plain|a markdown table| -| `ssv`|[ssv](/web/api/formatters/ssv/README.md)|text/plain|a space separated list of values| -| `ssvcomma`|[ssv](/web/api/formatters/ssv/README.md)|text/plain|a comma separated list of values| -| `tsv`|[csv](/web/api/formatters/csv/README.md)|text/plain|a TAB delimited `csv` (MS Excel flavor)| +| `array`|[ssv](https://github.com/netdata/netdata/blob/master/web/api/formatters/ssv/README.md)|application/json|a JSON array| +| `csv`|[csv](https://github.com/netdata/netdata/blob/master/web/api/formatters/csv/README.md)|text/plain|a text table, comma separated, with a header line (dimension names) and `\r\n` at the end of the lines| +| `csvjsonarray`|[csv](https://github.com/netdata/netdata/blob/master/web/api/formatters/csv/README.md)|application/json|a JSON array, with each row as another array (the first row has the dimension names)| +| `datasource`|[json](https://github.com/netdata/netdata/blob/master/web/api/formatters/json/README.md)|application/json|a Google Visualization Provider `datasource` javascript callback| +| `datatable`|[json](https://github.com/netdata/netdata/blob/master/web/api/formatters/json/README.md)|application/json|a Google `datatable`| +| 
`html`|[csv](https://github.com/netdata/netdata/blob/master/web/api/formatters/csv/README.md)|text/html|an html table| +| `json`|[json](https://github.com/netdata/netdata/blob/master/web/api/formatters/json/README.md)|application/json|a JSON object| +| `jsonp`|[json](https://github.com/netdata/netdata/blob/master/web/api/formatters/json/README.md)|application/json|a JSONP javascript callback| +| `markdown`|[csv](https://github.com/netdata/netdata/blob/master/web/api/formatters/csv/README.md)|text/plain|a markdown table| +| `ssv`|[ssv](https://github.com/netdata/netdata/blob/master/web/api/formatters/ssv/README.md)|text/plain|a space separated list of values| +| `ssvcomma`|[ssv](https://github.com/netdata/netdata/blob/master/web/api/formatters/ssv/README.md)|text/plain|a comma separated list of values| +| `tsv`|[csv](https://github.com/netdata/netdata/blob/master/web/api/formatters/csv/README.md)|text/plain|a TAB delimited `csv` (MS Excel flavor)| For examples of each format, check the relative module documentation. 
diff --git a/web/api/formatters/charts2json.c b/web/api/formatters/charts2json.c index 1fc20b493..61a9ecf2f 100644 --- a/web/api/formatters/charts2json.c +++ b/web/api/formatters/charts2json.c @@ -10,7 +10,7 @@ const char* get_release_channel() { if (use_stable == -1) { char filename[FILENAME_MAX + 1]; snprintfz(filename, FILENAME_MAX, "%s/.environment", netdata_configured_user_config_dir); - procfile *ff = procfile_open(filename, "=", PROCFILE_FLAG_DEFAULT); + procfile *ff = procfile_open(filename, "=", PROCFILE_FLAG_ERROR_ON_ERROR_LOG); if (ff) { procfile_set_quotes(ff, "'\""); ff = procfile_readall(ff); @@ -78,7 +78,7 @@ void charts2json(RRDHOST *host, BUFFER *wb, int skip_volatile, int show_archived rrdset2json(st, wb, &dimensions, &memory, skip_volatile); c++; - st->last_accessed_time = now; + st->last_accessed_time_s = now; } } rrdset_foreach_done(st); @@ -102,10 +102,10 @@ void charts2json(RRDHOST *host, BUFFER *wb, int skip_volatile, int show_archived , dimensions , alarms , memory - , rrd_hosts_available + , rrdhost_hosts_available() ); - if(unlikely(rrd_hosts_available > 1)) { + if(unlikely(rrdhost_hosts_available() > 1)) { rrd_rdlock(); size_t found = 0; @@ -178,7 +178,7 @@ void chartcollectors2json(RRDHOST *host, BUFFER *wb) { }; sprintf(name, "%s:%s", col.plugin, col.module); dictionary_set(dict, name, &col, sizeof(struct collector)); - st->last_accessed_time = now; + st->last_accessed_time_s = now; } } rrdset_foreach_done(st); diff --git a/web/api/formatters/csv/README.md b/web/api/formatters/csv/README.md index df7c11efa..fc5ffec1b 100644 --- a/web/api/formatters/csv/README.md +++ b/web/api/formatters/csv/README.md @@ -5,7 +5,7 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/web/api/formatte # CSV formatter -The CSV formatter presents [results of database queries](/web/api/queries/README.md) in the following formats: +The CSV formatter presents [results of database 
queries](https://github.com/netdata/netdata/blob/master/web/api/queries/README.md) in the following formats: | format|content type|description| | :----:|:----------:|:----------| diff --git a/web/api/formatters/json/README.md b/web/api/formatters/json/README.md index a0f8108e7..75f729ada 100644 --- a/web/api/formatters/json/README.md +++ b/web/api/formatters/json/README.md @@ -5,7 +5,7 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/web/api/formatte # JSON formatter -The CSV formatter presents [results of database queries](/web/api/queries/README.md) in the following formats: +The CSV formatter presents [results of database queries](https://github.com/netdata/netdata/blob/master/web/api/queries/README.md) in the following formats: | format | content type | description| |:----:|:----------:|:----------| diff --git a/web/api/formatters/json_wrapper.c b/web/api/formatters/json_wrapper.c index 8b9b7522c..beb74912e 100644 --- a/web/api/formatters/json_wrapper.c +++ b/web/api/formatters/json_wrapper.c @@ -32,6 +32,53 @@ static int fill_formatted_callback(const char *name, const char *value, RRDLABEL return 1; } +void rrdr_show_plan(RRDR *r, BUFFER *wb, const char *kq, const char *sq __maybe_unused) { + QUERY_TARGET *qt = r->internal.qt; + + buffer_sprintf(wb, "\n\t%squery_plan%s: {", kq, kq); + + for(size_t m = 0; m < qt->query.used; m++) { + QUERY_METRIC *qm = &qt->query.array[m]; + + if(m) + buffer_strcat(wb, ","); + + buffer_sprintf(wb, "\n\t\t%s%s%s: {", kq, string2str(qm->dimension.id), kq); + + buffer_sprintf(wb, "\n\t\t\t%splans%s: [", kq, kq); + for(size_t p = 0; p < qm->plan.used ;p++) { + QUERY_PLAN_ENTRY *qp = &qm->plan.array[p]; + if(p) + buffer_strcat(wb, ","); + + buffer_strcat(wb, "\n\t\t\t\t{"); + buffer_sprintf(wb, "\n\t\t\t\t\t%stier%s: %zu,", kq, kq, qp->tier); + buffer_sprintf(wb, "\n\t\t\t\t\t%safter%s: %ld,", kq, kq, qp->after); + buffer_sprintf(wb, "\n\t\t\t\t\t%sbefore%s: %ld", kq, kq, qp->before); + buffer_strcat(wb, 
"\n\t\t\t\t}"); + } + buffer_strcat(wb, "\n\t\t\t],"); + + buffer_sprintf(wb, "\n\t\t\t%stiers%s: [", kq, kq); + for(size_t tier = 0; tier < storage_tiers ;tier++) { + if(tier) + buffer_strcat(wb, ","); + + buffer_strcat(wb, "\n\t\t\t\t{"); + buffer_sprintf(wb, "\n\t\t\t\t\t%stier%s: %zu,", kq, kq, tier); + buffer_sprintf(wb, "\n\t\t\t\t\t%sdb_first_time%s: %ld,", kq, kq, qm->tiers[tier].db_first_time_s); + buffer_sprintf(wb, "\n\t\t\t\t\t%sdb_last_time%s: %ld,", kq, kq, qm->tiers[tier].db_last_time_s); + buffer_sprintf(wb, "\n\t\t\t\t\t%sweight%s: %ld", kq, kq, qm->tiers[tier].weight); + buffer_strcat(wb, "\n\t\t\t\t}"); + } + buffer_strcat(wb, "\n\t\t\t]"); + + buffer_strcat(wb, "\n\t\t}"); + } + + buffer_strcat(wb, "\n\t},"); +} + void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS options, int string_value, RRDR_GROUPING group_method) { @@ -70,9 +117,9 @@ void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS , kq, kq, sq, qt->id, sq , kq, kq, sq, qt->id, sq , kq, kq, (long long)r->update_every - , kq, kq, (long long)qt->db.minimum_latest_update_every - , kq, kq, (long long)qt->db.first_time_t - , kq, kq, (long long)qt->db.last_time_t + , kq, kq, (long long)qt->db.minimum_latest_update_every_s + , kq, kq, (long long)qt->db.first_time_s + , kq, kq, (long long)qt->db.last_time_s , kq, kq, (long long)r->before , kq, kq, (long long)r->after , kq, kq, sq, web_client_api_request_v1_data_group_to_string(group_method), sq @@ -369,9 +416,12 @@ void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS for(size_t tier = 0; tier < storage_tiers ; tier++) buffer_sprintf(wb, "%s%zu", tier>0?", ":"", r->internal.tier_points_read[tier]); - buffer_strcat(wb, " ]"); + buffer_strcat(wb, " ],"); + + if(options & RRDR_OPTION_SHOW_PLAN) + rrdr_show_plan(r, wb, kq, sq); - buffer_sprintf(wb, ",\n %sresult%s: ", kq, kq); + buffer_sprintf(wb, "\n %sresult%s: ", kq, kq); if(string_value) buffer_strcat(wb, sq); 
//info("JSONWRAPPER(): %s: END", r->st->id); diff --git a/web/api/formatters/rrd2json.c b/web/api/formatters/rrd2json.c index 8bf547192..64cde5b2b 100644 --- a/web/api/formatters/rrd2json.c +++ b/web/api/formatters/rrd2json.c @@ -77,6 +77,7 @@ int rrdset2value_api_v1( , time_t timeout , size_t tier , QUERY_SOURCE query_source + , STORAGE_PRIORITY priority ) { int ret = HTTP_RESP_INTERNAL_SERVER_ERROR; @@ -94,7 +95,8 @@ int rrdset2value_api_v1( group_options, timeout, tier, - query_source); + query_source, + priority); if(!r) { if(value_is_null) *value_is_null = 1; diff --git a/web/api/formatters/rrd2json.h b/web/api/formatters/rrd2json.h index 048281d7e..88b9f773f 100644 --- a/web/api/formatters/rrd2json.h +++ b/web/api/formatters/rrd2json.h @@ -79,6 +79,7 @@ int rrdset2value_api_v1( , time_t timeout , size_t tier , QUERY_SOURCE query_source + , STORAGE_PRIORITY priority ); #endif /* NETDATA_RRD2JSON_H */ diff --git a/web/api/formatters/rrdset2json.c b/web/api/formatters/rrdset2json.c index 1e8106335..449d4ddf5 100644 --- a/web/api/formatters/rrdset2json.c +++ b/web/api/formatters/rrdset2json.c @@ -25,8 +25,8 @@ void chart_labels2json(RRDSET *st, BUFFER *wb, size_t indentation) // generate JSON for the /api/v1/chart API call void rrdset2json(RRDSET *st, BUFFER *wb, size_t *dimensions_count, size_t *memory_used, int skip_volatile) { - time_t first_entry_t = rrdset_first_entry_t(st); - time_t last_entry_t = rrdset_last_entry_t(st); + time_t first_entry_t = rrdset_first_entry_s(st); + time_t last_entry_t = rrdset_last_entry_s(st); buffer_sprintf( wb, @@ -83,7 +83,7 @@ void rrdset2json(RRDSET *st, BUFFER *wb, size_t *dimensions_count, size_t *memor "\t\t\t\"dimensions\": {\n", st->update_every); - unsigned long memory = sizeof(RRDSET) + st->memsize; + unsigned long memory = sizeof(RRDSET); size_t dimensions = 0; RRDDIM *rd; diff --git a/web/api/formatters/ssv/README.md b/web/api/formatters/ssv/README.md index d9e193d66..4ca2a64ca 100644 --- 
a/web/api/formatters/ssv/README.md +++ b/web/api/formatters/ssv/README.md @@ -5,7 +5,7 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/web/api/formatte # SSV formatter -The SSV formatter sums all dimensions in [results of database queries](/web/api/queries/README.md) +The SSV formatter sums all dimensions in [results of database queries](https://github.com/netdata/netdata/blob/master/web/api/queries/README.md) to a single value and returns a list of such values showing how it changes through time. It supports the following formats: diff --git a/web/api/formatters/value/README.md b/web/api/formatters/value/README.md index a51e32de7..5b75ded7c 100644 --- a/web/api/formatters/value/README.md +++ b/web/api/formatters/value/README.md @@ -5,7 +5,7 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/web/api/formatte # Value formatter -The Value formatter presents [results of database queries](/web/api/queries/README.md) as a single value. +The Value formatter presents [results of database queries](https://github.com/netdata/netdata/blob/master/web/api/queries/README.md) as a single value. To calculate the single value to be returned, it sums the values of all dimensions. @@ -18,7 +18,7 @@ The Value formatter respects the following API `&options=`: | `min2max` | yes | to return the delta from the minimum value to the maximum value (across dimensions)| The Value formatter is not exposed by the API by itself. -Instead it is used by the [`ssv`](/web/api/formatters/ssv/README.md) formatter -and [health monitoring queries](/health/README.md). +Instead it is used by the [`ssv`](https://github.com/netdata/netdata/blob/master/web/api/formatters/ssv/README.md) formatter +and [health monitoring queries](https://github.com/netdata/netdata/blob/master/health/README.md). 
diff --git a/web/api/formatters/value/value.c b/web/api/formatters/value/value.c index 46a71303e..915d58ac9 100644 --- a/web/api/formatters/value/value.c +++ b/web/api/formatters/value/value.c @@ -106,7 +106,7 @@ QUERY_VALUE rrdmetric2value(RRDHOST *host, struct rrdcontext_acquired *rca, struct rrdinstance_acquired *ria, struct rrdmetric_acquired *rma, time_t after, time_t before, RRDR_OPTIONS options, RRDR_GROUPING group_method, const char *group_options, - size_t tier, time_t timeout, QUERY_SOURCE query_source + size_t tier, time_t timeout, QUERY_SOURCE query_source, STORAGE_PRIORITY priority ) { QUERY_TARGET_REQUEST qtr = { .host = host, @@ -122,6 +122,7 @@ QUERY_VALUE rrdmetric2value(RRDHOST *host, .tier = tier, .timeout = timeout, .query_source = query_source, + .priority = priority, }; ONEWAYALLOC *owa = onewayalloc_create(16 * 1024); diff --git a/web/api/formatters/value/value.h b/web/api/formatters/value/value.h index 76b1869f3..3f7f51ccb 100644 --- a/web/api/formatters/value/value.h +++ b/web/api/formatters/value/value.h @@ -23,7 +23,7 @@ QUERY_VALUE rrdmetric2value(RRDHOST *host, struct rrdcontext_acquired *rca, struct rrdinstance_acquired *ria, struct rrdmetric_acquired *rma, time_t after, time_t before, RRDR_OPTIONS options, RRDR_GROUPING group_method, const char *group_options, - size_t tier, time_t timeout, QUERY_SOURCE query_source + size_t tier, time_t timeout, QUERY_SOURCE query_source, STORAGE_PRIORITY priority ); NETDATA_DOUBLE rrdr2value(RRDR *r, long i, RRDR_OPTIONS options, int *all_values_are_null, NETDATA_DOUBLE *anomaly_rate); diff --git a/web/api/health/README.md b/web/api/health/README.md index 9ec8f31c0..bfdd0ac68 100644 --- a/web/api/health/README.md +++ b/web/api/health/README.md @@ -72,7 +72,7 @@ You can access the API via GET requests, by adding the bearer token to an `Autho curl "http://NODE:19999/api/v1/manage/health?cmd=RESET" -H "X-Auth-Token: Mytoken" ``` -By default access to the health management API is only allowed from 
`localhost`. Accessing the API from anything else will return a 403 error with the message `You are not allowed to access this resource.`. You can change permissions by editing the `allow management from` variable in `netdata.conf` within the [web] section. See [web server access lists](/web/server/README.md#access-lists) for more information. +By default access to the health management API is only allowed from `localhost`. Accessing the API from anything else will return a 403 error with the message `You are not allowed to access this resource.`. You can change permissions by editing the `allow management from` variable in `netdata.conf` within the [web] section. See [web server access lists](https://github.com/netdata/netdata/blob/master/web/server/README.md#access-lists) for more information. The command `RESET` just returns Netdata to the default operation, with all health checks and notifications enabled. If you've configured and entered your token correctly, you should see the plain text response `All health checks and notifications are enabled`. @@ -126,7 +126,7 @@ curl "http://NODE:19999/api/v1/manage/health?cmd=SILENCE&context=load" -H "X-Aut #### Selection criteria -The `selection criteria` are key/value pairs, in the format `key : value`, where value is a Netdata [simple pattern](/libnetdata/simple_pattern/README.md). This means that you can create very powerful selectors (you will rarely need more than one or two). +The `selection criteria` are key/value pairs, in the format `key : value`, where value is a Netdata [simple pattern](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md). This means that you can create very powerful selectors (you will rarely need more than one or two). The accepted keys for the `selection criteria` are the following: @@ -220,6 +220,6 @@ The file's location is configurable in `netdata.conf`. 
The default is shown belo ### Further reading -The test script under [tests/health_mgmtapi](/tests/health_mgmtapi/README.md) contains a series of tests that you can either run or read through to understand the various calls and responses better. +The test script under [tests/health_mgmtapi](https://github.com/netdata/netdata/blob/master/tests/health_mgmtapi/README.md) contains a series of tests that you can either run or read through to understand the various calls and responses better. diff --git a/web/api/health/health_cmdapi.c b/web/api/health/health_cmdapi.c index bad3e960a..7a939bc0f 100644 --- a/web/api/health/health_cmdapi.c +++ b/web/api/health/health_cmdapi.c @@ -196,7 +196,7 @@ int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, c w->response.data = wb; buffer_no_cacheable(w->response.data); if (ret == HTTP_RESP_OK && config_changed) { - BUFFER *jsonb = buffer_create(200); + BUFFER *jsonb = buffer_create(200, &netdata_buffers_statistics.buffers_health); health_silencers2json(jsonb); health_silencers2file(jsonb); buffer_free(jsonb); diff --git a/web/api/queries/README.md b/web/api/queries/README.md index 44cdd05b4..2a17ac784 100644 --- a/web/api/queries/README.md +++ b/web/api/queries/README.md @@ -88,7 +88,7 @@ To disable alignment, pass `&options=unaligned` to the query. To execute the query, the engine evaluates all dimensions of the chart, one after another. -The engine does not evaluate dimensions that do not match the [simple pattern](/libnetdata/simple_pattern/README.md) +The engine does not evaluate dimensions that do not match the [simple pattern](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md) given at the `dimensions` parameter, except when `options=percentage` is given (this option requires all the dimensions to be evaluated to find the percentage of each dimension vs to chart total). 
diff --git a/web/api/queries/query.c b/web/api/queries/query.c index 0365b6e96..1f10c5137 100644 --- a/web/api/queries/query.c +++ b/web/api/queries/query.c @@ -17,6 +17,8 @@ #include "percentile/percentile.h" #include "trimmed_mean/trimmed_mean.h" +#define POINTS_TO_EXPAND_QUERY 5 + // ---------------------------------------------------------------------------- static struct { @@ -694,7 +696,7 @@ static inline void rrdr_done(RRDR *r, long rrdr_line) { // tier management static bool query_metric_is_valid_tier(QUERY_METRIC *qm, size_t tier) { - if(!qm->tiers[tier].db_metric_handle || !qm->tiers[tier].db_first_time_t || !qm->tiers[tier].db_last_time_t || !qm->tiers[tier].db_update_every) + if(!qm->tiers[tier].db_metric_handle || !qm->tiers[tier].db_first_time_s || !qm->tiers[tier].db_last_time_s || !qm->tiers[tier].db_update_every_s) return false; return true; @@ -705,11 +707,11 @@ static size_t query_metric_first_working_tier(QUERY_METRIC *qm) { // find the db time-range for this tier for all metrics STORAGE_METRIC_HANDLE *db_metric_handle = qm->tiers[tier].db_metric_handle; - time_t first_t = qm->tiers[tier].db_first_time_t; - time_t last_t = qm->tiers[tier].db_last_time_t; - time_t update_every = qm->tiers[tier].db_update_every; + time_t first_time_s = qm->tiers[tier].db_first_time_s; + time_t last_time_s = qm->tiers[tier].db_last_time_s; + time_t update_every_s = qm->tiers[tier].db_update_every_s; - if(!db_metric_handle || !first_t || !last_t || !update_every) + if(!db_metric_handle || !first_time_s || !last_time_s || !update_every_s) continue; return tier; @@ -718,19 +720,23 @@ static size_t query_metric_first_working_tier(QUERY_METRIC *qm) { return 0; } -static long query_plan_points_coverage_weight(time_t db_first_t, time_t db_last_t, time_t db_update_every, time_t after_wanted, time_t before_wanted, size_t points_wanted, size_t tier __maybe_unused) { - if(db_first_t == 0 || db_last_t == 0 || db_update_every == 0) +static long 
query_plan_points_coverage_weight(time_t db_first_time_s, time_t db_last_time_s, time_t db_update_every_s, time_t after_wanted, time_t before_wanted, size_t points_wanted, size_t tier __maybe_unused) { + if(db_first_time_s == 0 || + db_last_time_s == 0 || + db_update_every_s == 0 || + db_first_time_s > before_wanted || + db_last_time_s < after_wanted) return -LONG_MAX; - time_t common_first_t = MAX(db_first_t, after_wanted); - time_t common_last_t = MIN(db_last_t, before_wanted); + long long common_first_t = MAX(db_first_time_s, after_wanted); + long long common_last_t = MIN(db_last_time_s, before_wanted); - long time_coverage = (common_last_t - common_first_t) * 1000000 / (before_wanted - after_wanted); - size_t points_wanted_in_coverage = points_wanted * time_coverage / 1000000; + long long time_coverage = (common_last_t - common_first_t) * 1000000LL / (before_wanted - after_wanted); + long long points_wanted_in_coverage = (long long)points_wanted * time_coverage / 1000000LL; - long points_available = (common_last_t - common_first_t) / db_update_every; - long points_delta = (long)(points_available - points_wanted_in_coverage); - long points_coverage = (points_delta < 0) ? (long)(points_available * time_coverage / points_wanted_in_coverage) : time_coverage; + long long points_available = (common_last_t - common_first_t) / db_update_every_s; + long long points_delta = (long)(points_available - points_wanted_in_coverage); + long long points_coverage = (points_delta < 0) ? 
(long)(points_available * time_coverage / points_wanted_in_coverage) : time_coverage; // a way to benefit higher tiers // points_coverage += (long)tier * 10000; @@ -738,7 +744,7 @@ static long query_plan_points_coverage_weight(time_t db_first_t, time_t db_last_ if(points_available <= 0) return -LONG_MAX; - return points_coverage; + return (long)(points_coverage + (25000LL * tier)); // 2.5% benefit for each higher tier } static size_t query_metric_best_tier_for_timeframe(QUERY_METRIC *qm, time_t after_wanted, time_t before_wanted, size_t points_wanted) { @@ -748,27 +754,49 @@ static size_t query_metric_best_tier_for_timeframe(QUERY_METRIC *qm, time_t afte if(unlikely(after_wanted == before_wanted || points_wanted <= 0)) return query_metric_first_working_tier(qm); - long weight[storage_tiers]; + time_t min_first_time_s = 0; + time_t max_last_time_s = 0; + + for(size_t tier = 0; tier < storage_tiers ; tier++) { + time_t first_time_s = qm->tiers[tier].db_first_time_s; + time_t last_time_s = qm->tiers[tier].db_last_time_s; + + if(!min_first_time_s || (first_time_s && first_time_s < min_first_time_s)) + min_first_time_s = first_time_s; + + if(!max_last_time_s || (last_time_s && last_time_s > max_last_time_s)) + max_last_time_s = last_time_s; + } for(size_t tier = 0; tier < storage_tiers ; tier++) { // find the db time-range for this tier for all metrics STORAGE_METRIC_HANDLE *db_metric_handle = qm->tiers[tier].db_metric_handle; - time_t first_t = qm->tiers[tier].db_first_time_t; - time_t last_t = qm->tiers[tier].db_last_time_t; - time_t update_every = qm->tiers[tier].db_update_every; - - if(!db_metric_handle || !first_t || !last_t || !update_every) { - weight[tier] = -LONG_MAX; + time_t first_time_s = qm->tiers[tier].db_first_time_s; + time_t last_time_s = qm->tiers[tier].db_last_time_s; + time_t update_every_s = qm->tiers[tier].db_update_every_s; + + if( !db_metric_handle || + !first_time_s || + !last_time_s || + !update_every_s || + first_time_s > before_wanted || + 
last_time_s < after_wanted + ) { + qm->tiers[tier].weight = -LONG_MAX; continue; } - weight[tier] = query_plan_points_coverage_weight(first_t, last_t, update_every, after_wanted, before_wanted, points_wanted, tier); + internal_fatal(first_time_s > before_wanted || last_time_s < after_wanted, "QUERY: invalid db durations"); + + qm->tiers[tier].weight = query_plan_points_coverage_weight( + min_first_time_s, max_last_time_s, update_every_s, + after_wanted, before_wanted, points_wanted, tier); } size_t best_tier = 0; for(size_t tier = 1; tier < storage_tiers ; tier++) { - if(weight[tier] >= weight[best_tier]) + if(qm->tiers[tier].weight >= qm->tiers[best_tier].weight) best_tier = tier; } @@ -788,38 +816,38 @@ static size_t rrddim_find_best_tier_for_timeframe(QUERY_TARGET *qt, time_t after for(size_t tier = 0; tier < storage_tiers ; tier++) { - time_t common_first_t = 0; - time_t common_last_t = 0; - time_t common_update_every = 0; + time_t common_first_time_s = 0; + time_t common_last_time_s = 0; + time_t common_update_every_s = 0; // find the db time-range for this tier for all metrics for(size_t i = 0, used = qt->query.used; i < used ; i++) { QUERY_METRIC *qm = &qt->query.array[i]; - time_t first_t = qm->tiers[tier].db_first_time_t; - time_t last_t = qm->tiers[tier].db_last_time_t; - time_t update_every = qm->tiers[tier].db_update_every; + time_t first_time_s = qm->tiers[tier].db_first_time_s; + time_t last_time_s = qm->tiers[tier].db_last_time_s; + time_t update_every_s = qm->tiers[tier].db_update_every_s; - if(!first_t || !last_t || !update_every) + if(!first_time_s || !last_time_s || !update_every_s) continue; - if(!common_first_t) - common_first_t = first_t; + if(!common_first_time_s) + common_first_time_s = first_time_s; else - common_first_t = MIN(first_t, common_first_t); + common_first_time_s = MIN(first_time_s, common_first_time_s); - if(!common_last_t) - common_last_t = last_t; + if(!common_last_time_s) + common_last_time_s = last_time_s; else - 
common_last_t = MAX(last_t, common_last_t); + common_last_time_s = MAX(last_time_s, common_last_time_s); - if(!common_update_every) - common_update_every = update_every; + if(!common_update_every_s) + common_update_every_s = update_every_s; else - common_update_every = MIN(update_every, common_update_every); + common_update_every_s = MIN(update_every_s, common_update_every_s); } - weight[tier] = query_plan_points_coverage_weight(common_first_t, common_last_t, common_update_every, after_wanted, before_wanted, points_wanted, tier); + weight[tier] = query_plan_points_coverage_weight(common_first_time_s, common_last_time_s, common_update_every_s, after_wanted, before_wanted, points_wanted, tier); } size_t best_tier = 0; @@ -842,19 +870,19 @@ static time_t rrdset_find_natural_update_every_for_timeframe(QUERY_TARGET *qt, t best_tier = rrddim_find_best_tier_for_timeframe(qt, after_wanted, before_wanted, points_wanted); // find the db minimum update every for this tier for all metrics - time_t common_update_every = default_rrd_update_every; + time_t common_update_every_s = default_rrd_update_every; for(size_t i = 0, used = qt->query.used; i < used ; i++) { QUERY_METRIC *qm = &qt->query.array[i]; - time_t update_every = qm->tiers[best_tier].db_update_every; + time_t update_every_s = qm->tiers[best_tier].db_update_every_s; if(!i) - common_update_every = update_every; + common_update_every_s = update_every_s; else - common_update_every = MIN(update_every, common_update_every); + common_update_every_s = MIN(update_every_s, common_update_every_s); } - return common_update_every; + return common_update_every_s; } // ---------------------------------------------------------------------------- @@ -888,17 +916,6 @@ QUERY_POINT QUERY_POINT_EMPTY = { #define query_point_set_id(point, point_id) debug_dummy() #endif -typedef struct query_plan_entry { - size_t tier; - time_t after; - time_t before; -} QUERY_PLAN_ENTRY; - -typedef struct query_plan { - size_t entries; - QUERY_PLAN_ENTRY 
data[RRD_STORAGE_TIERS*2]; -} QUERY_PLAN; - typedef struct query_engine_ops { // configuration RRDR *r; @@ -908,14 +925,15 @@ typedef struct query_engine_ops { TIER_QUERY_FETCH tier_query_fetch; // query planer - QUERY_PLAN plan; size_t current_plan; time_t current_plan_expire_time; + time_t plan_expanded_after; + time_t plan_expanded_before; // storage queries size_t tier; struct query_metric_tier *tier_ptr; - struct storage_engine_query_handle handle; + struct storage_engine_query_handle *handle; STORAGE_POINT (*next_metric)(struct storage_engine_query_handle *handle); int (*is_finished)(struct storage_engine_query_handle *handle); void (*finalize)(struct storage_engine_query_handle *handle); @@ -937,31 +955,128 @@ typedef struct query_engine_ops { // ---------------------------------------------------------------------------- // query planer -#define query_plan_should_switch_plan(ops, now) ((now) >= (ops).current_plan_expire_time) +#define query_plan_should_switch_plan(ops, now) ((now) >= (ops)->current_plan_expire_time) + +static size_t query_planer_expand_duration_in_points(time_t this_update_every, time_t next_update_every) { + + time_t delta = this_update_every - next_update_every; + if(delta < 0) delta = -delta; + + size_t points; + if(delta < this_update_every * POINTS_TO_EXPAND_QUERY) + points = POINTS_TO_EXPAND_QUERY; + else + points = (delta + this_update_every - 1) / this_update_every; + + return points; +} -static void query_planer_activate_plan(QUERY_ENGINE_OPS *ops, size_t plan_id, time_t overwrite_after) { - if(unlikely(plan_id >= ops->plan.entries)) - plan_id = ops->plan.entries - 1; +static void query_planer_initialize_plans(QUERY_ENGINE_OPS *ops) { + QUERY_METRIC *qm = ops->qm; + + for(size_t p = 0; p < qm->plan.used ; p++) { + size_t tier = qm->plan.array[p].tier; + time_t update_every = qm->tiers[tier].db_update_every_s; + + size_t points_to_add_to_after; + if(p > 0) { + // there is another plan before to this + + size_t tier0 = 
qm->plan.array[p - 1].tier; + time_t update_every0 = qm->tiers[tier0].db_update_every_s; + + points_to_add_to_after = query_planer_expand_duration_in_points(update_every, update_every0); + } + else + points_to_add_to_after = (tier == 0) ? 0 : POINTS_TO_EXPAND_QUERY; - time_t after = ops->plan.data[plan_id].after; - time_t before = ops->plan.data[plan_id].before; + size_t points_to_add_to_before; + if(p + 1 < qm->plan.used) { + // there is another plan after to this - if(overwrite_after > after && overwrite_after < before) - after = overwrite_after; + size_t tier1 = qm->plan.array[p+1].tier; + time_t update_every1 = qm->tiers[tier1].db_update_every_s; - ops->tier = ops->plan.data[plan_id].tier; - ops->tier_ptr = &ops->qm->tiers[ops->tier]; - ops->tier_ptr->eng->api.query_ops.init(ops->tier_ptr->db_metric_handle, &ops->handle, after, before); - ops->next_metric = ops->tier_ptr->eng->api.query_ops.next_metric; - ops->is_finished = ops->tier_ptr->eng->api.query_ops.is_finished; - ops->finalize = ops->tier_ptr->eng->api.query_ops.finalize; + points_to_add_to_before = query_planer_expand_duration_in_points(update_every, update_every1); + } + else + points_to_add_to_before = POINTS_TO_EXPAND_QUERY; + + time_t after = qm->plan.array[p].after - (time_t)(update_every * points_to_add_to_after); + time_t before = qm->plan.array[p].before + (time_t)(update_every * points_to_add_to_before); + + qm->plan.array[p].expanded_after = after; + qm->plan.array[p].expanded_before = before; + + struct query_metric_tier *tier_ptr = &qm->tiers[tier]; + tier_ptr->eng->api.query_ops.init( + tier_ptr->db_metric_handle, + &qm->plan.array[p].handle, + after, before, + ops->r->internal.qt->request.priority); + + qm->plan.array[p].next_metric = tier_ptr->eng->api.query_ops.next_metric; + qm->plan.array[p].is_finished = tier_ptr->eng->api.query_ops.is_finished; + qm->plan.array[p].finalize = tier_ptr->eng->api.query_ops.finalize; + qm->plan.array[p].initialized = true; + qm->plan.array[p].finalized 
= false; + } +} + +static void query_planer_finalize_plan(QUERY_ENGINE_OPS *ops, size_t plan_id) { + QUERY_METRIC *qm = ops->qm; + + if(qm->plan.array[plan_id].initialized && !qm->plan.array[plan_id].finalized) { + qm->plan.array[plan_id].finalize(&qm->plan.array[plan_id].handle); + qm->plan.array[plan_id].initialized = false; + qm->plan.array[plan_id].finalized = true; + qm->plan.array[plan_id].next_metric = NULL; + qm->plan.array[plan_id].is_finished = NULL; + qm->plan.array[plan_id].finalize = NULL; + + if(ops->current_plan == plan_id) { + ops->next_metric = NULL; + ops->is_finished = NULL; + ops->finalize = NULL; + } + } +} + +static void query_planer_finalize_remaining_plans(QUERY_ENGINE_OPS *ops) { + QUERY_METRIC *qm = ops->qm; + + for(size_t p = 0; p < qm->plan.used ; p++) + query_planer_finalize_plan(ops, p); +} + +static void query_planer_activate_plan(QUERY_ENGINE_OPS *ops, size_t plan_id, time_t overwrite_after __maybe_unused) { + QUERY_METRIC *qm = ops->qm; + + internal_fatal(plan_id >= qm->plan.used, "QUERY: invalid plan_id given"); + internal_fatal(!qm->plan.array[plan_id].initialized, "QUERY: plan has not been initialized"); + internal_fatal(qm->plan.array[plan_id].finalized, "QUERY: plan has been finalized"); + + internal_fatal(qm->plan.array[plan_id].after > qm->plan.array[plan_id].before, "QUERY: flipped after/before"); + + ops->tier = qm->plan.array[plan_id].tier; + ops->tier_ptr = &qm->tiers[ops->tier]; + ops->handle = &qm->plan.array[plan_id].handle; + ops->next_metric = qm->plan.array[plan_id].next_metric; + ops->is_finished = qm->plan.array[plan_id].is_finished; + ops->finalize = qm->plan.array[plan_id].finalize; ops->current_plan = plan_id; - ops->current_plan_expire_time = ops->plan.data[plan_id].before; + + if(plan_id + 1 < qm->plan.used && qm->plan.array[plan_id + 1].after < qm->plan.array[plan_id].before) + ops->current_plan_expire_time = qm->plan.array[plan_id + 1].after; + else + ops->current_plan_expire_time = 
qm->plan.array[plan_id].before; + + ops->plan_expanded_after = qm->plan.array[plan_id].expanded_after; + ops->plan_expanded_before = qm->plan.array[plan_id].expanded_before; } -static void query_planer_next_plan(QUERY_ENGINE_OPS *ops, time_t now, time_t last_point_end_time) { - internal_error(now < ops->current_plan_expire_time && now < ops->plan.data[ops->current_plan].before, - "QUERY: switching query plan too early!"); +static bool query_planer_next_plan(QUERY_ENGINE_OPS *ops, time_t now, time_t last_point_end_time) { + QUERY_METRIC *qm = ops->qm; size_t old_plan = ops->current_plan; @@ -969,32 +1084,26 @@ static void query_planer_next_plan(QUERY_ENGINE_OPS *ops, time_t now, time_t las do { ops->current_plan++; - if (ops->current_plan >= ops->plan.entries) { + if (ops->current_plan >= qm->plan.used) { ops->current_plan = old_plan; ops->current_plan_expire_time = ops->r->internal.qt->window.before; // let the query run with current plan // we will not switch it - return; + return false; } - next_plan_before_time = ops->plan.data[ops->current_plan].before; + next_plan_before_time = qm->plan.array[ops->current_plan].before; } while(now >= next_plan_before_time || last_point_end_time >= next_plan_before_time); - if(!query_metric_is_valid_tier(ops->qm, ops->plan.data[ops->current_plan].tier)) { + if(!query_metric_is_valid_tier(qm, qm->plan.array[ops->current_plan].tier)) { ops->current_plan = old_plan; ops->current_plan_expire_time = ops->r->internal.qt->window.before; - return; - } - - if(ops->finalize) { - ops->finalize(&ops->handle); - ops->finalize = NULL; - ops->is_finished = NULL; + return false; } - // internal_error(true, "QUERY: switched plan to %zu (all is %zu), previous expiration was %ld, this starts at %ld, now is %ld, last_point_end_time %ld", ops->current_plan, ops->plan.entries, ops->plan.data[ops->current_plan-1].before, ops->plan.data[ops->current_plan].after, now, last_point_end_time); - + query_planer_finalize_plan(ops, old_plan); 
query_planer_activate_plan(ops, ops->current_plan, MIN(now, last_point_end_time)); + return true; } static int compare_query_plan_entries_on_start_time(const void *a, const void *b) { @@ -1004,59 +1113,66 @@ static int compare_query_plan_entries_on_start_time(const void *a, const void *b } static bool query_plan(QUERY_ENGINE_OPS *ops, time_t after_wanted, time_t before_wanted, size_t points_wanted) { - //BUFFER *wb = buffer_create(1000); - //buffer_sprintf(wb, "QUERY PLAN for chart '%s' dimension '%s', from %ld to %ld:", rd->rrdset->name, rd->name, after_wanted, before_wanted); + QUERY_METRIC *qm = ops->qm; // put our selected tier as the first plan size_t selected_tier; if(ops->r->internal.query_options & RRDR_OPTION_SELECTED_TIER && ops->r->internal.qt->window.tier < storage_tiers - && query_metric_is_valid_tier(ops->qm, ops->r->internal.qt->window.tier)) { + && query_metric_is_valid_tier(qm, ops->r->internal.qt->window.tier)) { selected_tier = ops->r->internal.qt->window.tier; } else { - selected_tier = query_metric_best_tier_for_timeframe(ops->qm, after_wanted, before_wanted, points_wanted); + selected_tier = query_metric_best_tier_for_timeframe(qm, after_wanted, before_wanted, points_wanted); if(ops->r->internal.query_options & RRDR_OPTION_SELECTED_TIER) ops->r->internal.query_options &= ~RRDR_OPTION_SELECTED_TIER; + + if(!query_metric_is_valid_tier(qm, selected_tier)) + return false; + + if(qm->tiers[selected_tier].db_first_time_s > before_wanted || + qm->tiers[selected_tier].db_last_time_s < after_wanted) + return false; } - ops->plan.entries = 1; - ops->plan.data[0].tier = selected_tier; - ops->plan.data[0].after = ops->qm->tiers[selected_tier].db_first_time_t; - ops->plan.data[0].before = ops->qm->tiers[selected_tier].db_last_time_t; + qm->plan.used = 1; + qm->plan.array[0].tier = selected_tier; + qm->plan.array[0].after = (qm->tiers[selected_tier].db_first_time_s < after_wanted) ? 
after_wanted : qm->tiers[selected_tier].db_first_time_s; + qm->plan.array[0].before = (qm->tiers[selected_tier].db_last_time_s > before_wanted) ? before_wanted : qm->tiers[selected_tier].db_last_time_s; if(!(ops->r->internal.query_options & RRDR_OPTION_SELECTED_TIER)) { // the selected tier - time_t selected_tier_first_time_t = ops->plan.data[0].after; - time_t selected_tier_last_time_t = ops->plan.data[0].before; - - //buffer_sprintf(wb, ": SELECTED tier %zu, from %ld to %ld", selected_tier, ops->plan.data[0].after, ops->plan.data[0].before); + time_t selected_tier_first_time_s = qm->plan.array[0].after; + time_t selected_tier_last_time_s = qm->plan.array[0].before; // check if our selected tier can start the query - if (selected_tier_first_time_t > after_wanted) { + if (selected_tier_first_time_s > after_wanted) { // we need some help from other tiers for (size_t tr = (int)selected_tier + 1; tr < storage_tiers; tr++) { - if(!query_metric_is_valid_tier(ops->qm, tr)) + if(!query_metric_is_valid_tier(qm, tr)) continue; // find the first time of this tier - time_t first_time_t = ops->qm->tiers[tr].db_first_time_t; - - //buffer_sprintf(wb, ": EVAL AFTER tier %d, %ld", tier, first_time_t); + time_t tier_first_time_s = qm->tiers[tr].db_first_time_s; // can it help? - if (first_time_t < selected_tier_first_time_t) { + if (tier_first_time_s < selected_tier_first_time_s) { // it can help us add detail at the beginning of the query QUERY_PLAN_ENTRY t = { .tier = tr, - .after = (first_time_t < after_wanted) ? after_wanted : first_time_t, - .before = selected_tier_first_time_t}; - ops->plan.data[ops->plan.entries++] = t; + .after = (tier_first_time_s < after_wanted) ? 
after_wanted : tier_first_time_s, + .before = selected_tier_first_time_s, + .initialized = false, + .finalized = false, + }; + qm->plan.array[qm->plan.used++] = t; + + internal_fatal(!t.after || !t.before, "QUERY: invalid plan selected"); // prepare for the tier - selected_tier_first_time_t = t.after; + selected_tier_first_time_s = t.after; if (t.after <= after_wanted) break; @@ -1065,28 +1181,33 @@ static bool query_plan(QUERY_ENGINE_OPS *ops, time_t after_wanted, time_t before } // check if our selected tier can finish the query - if (selected_tier_last_time_t < before_wanted) { + if (selected_tier_last_time_s < before_wanted) { // we need some help from other tiers for (int tr = (int)selected_tier - 1; tr >= 0; tr--) { - if(!query_metric_is_valid_tier(ops->qm, tr)) + if(!query_metric_is_valid_tier(qm, tr)) continue; // find the last time of this tier - time_t last_time_t = ops->qm->tiers[tr].db_last_time_t; + time_t tier_last_time_s = qm->tiers[tr].db_last_time_s; - //buffer_sprintf(wb, ": EVAL BEFORE tier %d, %ld", tier, last_time_t); + //buffer_sprintf(wb, ": EVAL BEFORE tier %d, %ld", tier, last_time_s); // can it help? - if (last_time_t > selected_tier_last_time_t) { + if (tier_last_time_s > selected_tier_last_time_s) { // it can help us add detail at the end of the query QUERY_PLAN_ENTRY t = { .tier = tr, - .after = selected_tier_last_time_t, - .before = (last_time_t > before_wanted) ? before_wanted : last_time_t}; - ops->plan.data[ops->plan.entries++] = t; + .after = selected_tier_last_time_s, + .before = (tier_last_time_s > before_wanted) ? 
before_wanted : tier_last_time_s, + .initialized = false, + .finalized = false, + }; + qm->plan.array[qm->plan.used++] = t; // prepare for the tier - selected_tier_last_time_t = t.before; + selected_tier_last_time_s = t.before; + + internal_fatal(!t.after || !t.before, "QUERY: invalid plan selected"); if (t.before >= before_wanted) break; @@ -1096,26 +1217,21 @@ static bool query_plan(QUERY_ENGINE_OPS *ops, time_t after_wanted, time_t before } // sort the query plan - if(ops->plan.entries > 1) - qsort(&ops->plan.data, ops->plan.entries, sizeof(QUERY_PLAN_ENTRY), compare_query_plan_entries_on_start_time); + if(qm->plan.used > 1) + qsort(&qm->plan.array, qm->plan.used, sizeof(QUERY_PLAN_ENTRY), compare_query_plan_entries_on_start_time); - // make sure it has the whole timeframe we need - if(ops->plan.data[0].after < after_wanted) - ops->plan.data[0].after = after_wanted; - - if(ops->plan.data[ops->plan.entries - 1].before > before_wanted) - ops->plan.data[ops->plan.entries - 1].before = before_wanted; - - //buffer_sprintf(wb, ": FINAL STEPS %zu", ops->plan.entries); - - //for(size_t i = 0; i < ops->plan.entries ;i++) - // buffer_sprintf(wb, ": STEP %zu = use tier %zu from %ld to %ld", i+1, ops->plan.data[i].tier, ops->plan.data[i].after, ops->plan.data[i].before); - - //internal_error(true, "%s", buffer_tostring(wb)); - - if(!query_metric_is_valid_tier(ops->qm, ops->plan.data[0].tier)) + if(!query_metric_is_valid_tier(qm, qm->plan.array[0].tier)) return false; +#ifdef NETDATA_INTERNAL_CHECKS + for(size_t p = 0; p < qm->plan.used ;p++) { + internal_fatal(qm->plan.array[p].after > qm->plan.array[p].before, "QUERY: flipped after/before"); + internal_fatal(qm->plan.array[p].after < after_wanted, "QUERY: too small plan first time"); + internal_fatal(qm->plan.array[p].before > before_wanted, "QUERY: too big plan last time"); + } +#endif + + query_planer_initialize_plans(ops); query_planer_activate_plan(ops, 0, 0); return true; @@ -1146,24 +1262,45 @@ static bool 
query_plan(QUERY_ENGINE_OPS *ops, time_t after_wanted, time_t before #define query_add_point_to_group(r, point, ops) do { \ if(likely(netdata_double_isnumber((point).value))) { \ if(likely(fpclassify((point).value) != FP_ZERO)) \ - (ops).group_points_non_zero++; \ + (ops)->group_points_non_zero++; \ \ if(unlikely((point).flags & SN_FLAG_RESET)) \ - (ops).group_value_flags |= RRDR_VALUE_RESET; \ + (ops)->group_value_flags |= RRDR_VALUE_RESET; \ \ - (ops).grouping_add(r, (point).value); \ + (ops)->grouping_add(r, (point).value); \ } \ \ - (ops).group_points_added++; \ - (ops).group_anomaly_rate += (point).anomaly; \ + (ops)->group_points_added++; \ + (ops)->group_anomaly_rate += (point).anomaly; \ } while(0) -static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { +static QUERY_ENGINE_OPS *rrd2rrdr_query_prep(RRDR *r, size_t dim_id_in_rrdr) { QUERY_TARGET *qt = r->internal.qt; - QUERY_METRIC *qm = &qt->query.array[dim_id_in_rrdr]; + + QUERY_ENGINE_OPS *ops = onewayalloc_mallocz(r->internal.owa, sizeof(QUERY_ENGINE_OPS)); + *ops = (QUERY_ENGINE_OPS) { + .r = r, + .qm = &qt->query.array[dim_id_in_rrdr], + .grouping_add = r->internal.grouping_add, + .grouping_flush = r->internal.grouping_flush, + .tier_query_fetch = r->internal.tier_query_fetch, + .view_update_every = r->update_every, + .query_granularity = (time_t)(r->update_every / r->group), + .group_value_flags = RRDR_VALUE_NOTHING, + }; + + if(!query_plan(ops, qt->window.after, qt->window.before, qt->window.points)) + return NULL; + + return ops; +} + +static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_OPS *ops) { + QUERY_TARGET *qt = r->internal.qt; + QUERY_METRIC *qm = &qt->query.array[dim_id_in_rrdr]; (void)qm; size_t points_wanted = qt->window.points; time_t after_wanted = qt->window.after; - time_t before_wanted = qt->window.before; + time_t before_wanted = qt->window.before; (void)before_wanted; // bool debug_this = false; // if(strcmp("user", 
string2str(rd->id)) == 0 && strcmp("system.cpu", string2str(rd->rrdset->id)) == 0) @@ -1174,39 +1311,30 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { size_t points_added = 0; - QUERY_ENGINE_OPS ops = { - .r = r, - .qm = qm, - .grouping_add = r->internal.grouping_add, - .grouping_flush = r->internal.grouping_flush, - .tier_query_fetch = r->internal.tier_query_fetch, - .view_update_every = r->update_every, - .query_granularity = (time_t)(r->update_every / r->group), - .group_value_flags = RRDR_VALUE_NOTHING - }; - long rrdr_line = -1; bool use_anomaly_bit_as_value = (r->internal.query_options & RRDR_OPTION_ANOMALY_BIT) ? true : false; - if(!query_plan(&ops, after_wanted, before_wanted, points_wanted)) - return; - NETDATA_DOUBLE min = r->min, max = r->max; QUERY_POINT last2_point = QUERY_POINT_EMPTY; QUERY_POINT last1_point = QUERY_POINT_EMPTY; QUERY_POINT new_point = QUERY_POINT_EMPTY; - time_t now_start_time = after_wanted - ops.query_granularity; - time_t now_end_time = after_wanted + ops.view_update_every - ops.query_granularity; + // ONE POINT READ-AHEAD + // when we switch plans, we read-ahead a point from the next plan + // to join them smoothly at the exact time the next plan begins + STORAGE_POINT next1_point = STORAGE_POINT_UNSET; + + time_t now_start_time = after_wanted - ops->query_granularity; + time_t now_end_time = after_wanted + ops->view_update_every - ops->query_granularity; size_t db_points_read_since_plan_switch = 0; (void)db_points_read_since_plan_switch; // The main loop, based on the query granularity we need - for( ; points_added < points_wanted ; now_start_time = now_end_time, now_end_time += ops.view_update_every) { + for( ; points_added < points_wanted ; now_start_time = now_end_time, now_end_time += ops->view_update_every) { if(unlikely(query_plan_should_switch_plan(ops, now_end_time))) { - query_planer_next_plan(&ops, now_end_time, new_point.end_time); + query_planer_next_plan(ops, now_end_time, 
new_point.end_time); db_points_read_since_plan_switch = 0; } @@ -1219,7 +1347,7 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { last1_point = new_point; } - if(unlikely(ops.is_finished(&ops.handle))) { + if(unlikely(ops->is_finished(ops->handle))) { if(count_same_end_time != 0) { last2_point = last1_point; last1_point = new_point; @@ -1235,29 +1363,62 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { // fetch the new point { - db_points_read_since_plan_switch++; - STORAGE_POINT sp = ops.next_metric(&ops.handle); + STORAGE_POINT sp; + if(likely(storage_point_is_unset(next1_point))) { + db_points_read_since_plan_switch++; + sp = ops->next_metric(ops->handle); + } + else { + // ONE POINT READ-AHEAD + sp = next1_point; + storage_point_unset(next1_point); + db_points_read_since_plan_switch = 1; + } + + // ONE POINT READ-AHEAD + if(unlikely(query_plan_should_switch_plan(ops, sp.end_time_s) && + query_planer_next_plan(ops, now_end_time, new_point.end_time))) { + + // The end time of the current point, crosses our plans (tiers) + // so, we switched plan (tier) + // + // There are 2 cases now: + // + // A. the entire point of the previous plan is to the future of point from the next plan + // B. 
part of the point of the previous plan overlaps with the point from the next plan + + STORAGE_POINT sp2 = ops->next_metric(ops->handle); + + if(sp.start_time_s > sp2.start_time_s) + // the point from the previous plan is useless + sp = sp2; + else + // let the query run from the previous plan + // but setting this will also cut off the interpolation + // of the point from the previous plan + next1_point = sp2; + } - ops.db_points_read_per_tier[ops.tier]++; - ops.db_total_points_read++; + ops->db_points_read_per_tier[ops->tier]++; + ops->db_total_points_read++; - new_point.start_time = sp.start_time; - new_point.end_time = sp.end_time; + new_point.start_time = sp.start_time_s; + new_point.end_time = sp.end_time_s; new_point.anomaly = sp.count ? (NETDATA_DOUBLE)sp.anomaly_count * 100.0 / (NETDATA_DOUBLE)sp.count : 0.0; - query_point_set_id(new_point, ops.db_total_points_read); + query_point_set_id(new_point, ops->db_total_points_read); // if(debug_this) // info("QUERY: got point %zu, from time %ld to %ld // now from %ld to %ld // query from %ld to %ld", // new_point.id, new_point.start_time, new_point.end_time, now_start_time, now_end_time, after_wanted, before_wanted); // - // set the right value to the point we got - if(likely(!storage_point_is_unset(sp) && !storage_point_is_empty(sp))) { + // get the right value from the point we got + if(likely(!storage_point_is_unset(sp) && !storage_point_is_gap(sp))) { if(unlikely(use_anomaly_bit_as_value)) new_point.value = new_point.anomaly; else { - switch (ops.tier_query_fetch) { + switch (ops->tier_query_fetch) { default: case TIER_QUERY_FETCH_AVERAGE: new_point.value = sp.sum / sp.count; @@ -1284,19 +1445,30 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { } // check if the db is giving us zero duration points - if(unlikely(new_point.start_time == new_point.end_time)) { - internal_error(true, "QUERY: '%s', dimension '%s' next_metric() returned point %zu start time %ld, end time %ld, that are 
both equal", - qt->id, string2str(qm->dimension.id), new_point.id, new_point.start_time, new_point.end_time); + if(unlikely(db_points_read_since_plan_switch > 1 && + new_point.start_time == new_point.end_time)) { - new_point.start_time = new_point.end_time - ops.tier_ptr->db_update_every; + internal_error(true, "QUERY: '%s', dimension '%s' next_metric() returned " + "point %zu from %ld to %ld, that are both equal", + qt->id, string2str(qm->dimension.id), + new_point.id, new_point.start_time, new_point.end_time); + + new_point.start_time = new_point.end_time - ops->tier_ptr->db_update_every_s; } // check if the db is advancing the query - if(unlikely(new_point.end_time <= last1_point.end_time)) { - internal_error(db_points_read_since_plan_switch > 1, - "QUERY: '%s', dimension '%s' next_metric() returned point %zu from %ld to %ld, before the last point %zu from %ld to %ld, now is %ld to %ld", - qt->id, string2str(qm->dimension.id), new_point.id, new_point.start_time, new_point.end_time, - last1_point.id, last1_point.start_time, last1_point.end_time, now_start_time, now_end_time); + if(unlikely(db_points_read_since_plan_switch > 1 && + new_point.end_time <= last1_point.end_time)) { + + internal_error(true, + "QUERY: '%s', dimension '%s' next_metric() returned " + "point %zu from %ld to %ld, before the " + "last point %zu from %ld to %ld, " + "now is %ld to %ld", + qt->id, string2str(qm->dimension.id), + new_point.id, new_point.start_time, new_point.end_time, + last1_point.id, last1_point.start_time, last1_point.end_time, + now_start_time, now_end_time); count_same_end_time++; continue; @@ -1321,12 +1493,16 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { // at exactly the time we will want // we only log if this is not point 1 - internal_error(new_point.end_time < after_wanted && new_point.id > 1, - "QUERY: '%s', dimension '%s' next_metric() returned point %zu from %ld time %ld, which is entirely before our current timeframe %ld to %ld 
(and before the entire query, after %ld, before %ld)", + internal_error(new_point.end_time < ops->plan_expanded_after && + db_points_read_since_plan_switch > 1, + "QUERY: '%s', dimension '%s' next_metric() " + "returned point %zu from %ld time %ld, " + "which is entirely before our current timeframe %ld to %ld " + "(and before the entire query, after %ld, before %ld)", qt->id, string2str(qm->dimension.id), new_point.id, new_point.start_time, new_point.end_time, now_start_time, now_end_time, - after_wanted, before_wanted); + ops->plan_expanded_after, ops->plan_expanded_before); } } @@ -1339,20 +1515,31 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { if(unlikely(count_same_end_time)) { internal_error(true, - "QUERY: '%s', dimension '%s', the database does not advance the query, it returned an end time less or equal to the end time of the last point we got %ld, %zu times", - qt->id, string2str(qm->dimension.id), last1_point.end_time, count_same_end_time); + "QUERY: '%s', dimension '%s', the database does not advance the query," + " it returned an end time less or equal to the end time of the last " + "point we got %ld, %zu times", + qt->id, string2str(qm->dimension.id), + last1_point.end_time, count_same_end_time); if(unlikely(new_point.end_time <= last1_point.end_time)) new_point.end_time = now_end_time; } + time_t stop_time = new_point.end_time; + if(unlikely(!storage_point_is_unset(next1_point))) { + // ONE POINT READ-AHEAD + // the point crosses the start time of the + // read ahead storage point we have read + stop_time = next1_point.start_time_s; + } + // the inner loop // we have 3 points in memory: last2, last1, new // we select the one to use based on their timestamps size_t iterations = 0; - for ( ; now_end_time <= new_point.end_time && points_added < points_wanted ; - now_end_time += ops.view_update_every, iterations++) { + for ( ; now_end_time <= stop_time && points_added < points_wanted ; + now_end_time += 
ops->view_update_every, iterations++) { // now_start_time is wrong in this loop // but, we don't need it @@ -1411,20 +1598,20 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { RRDR_VALUE_FLAGS *rrdr_value_options_ptr = &r->o[rrdr_o_v_index]; // update the dimension options - if(likely(ops.group_points_non_zero)) + if(likely(ops->group_points_non_zero)) r->od[dim_id_in_rrdr] |= RRDR_DIMENSION_NONZERO; // store the specific point options - *rrdr_value_options_ptr = ops.group_value_flags; + *rrdr_value_options_ptr = ops->group_value_flags; // store the group value - NETDATA_DOUBLE group_value = ops.grouping_flush(r, rrdr_value_options_ptr); + NETDATA_DOUBLE group_value = ops->grouping_flush(r, rrdr_value_options_ptr); r->v[rrdr_o_v_index] = group_value; // we only store uint8_t anomaly rates, // so let's get double precision by storing // anomaly rates in the range 0 - 200 - r->ar[rrdr_o_v_index] = ops.group_anomaly_rate / (NETDATA_DOUBLE)ops.group_points_added; + r->ar[rrdr_o_v_index] = ops->group_anomaly_rate / (NETDATA_DOUBLE)ops->group_points_added; if(likely(points_added || dim_id_in_rrdr)) { // find the min/max across all dimensions @@ -1440,72 +1627,71 @@ static inline void rrd2rrdr_do_dimension(RRDR *r, size_t dim_id_in_rrdr) { } points_added++; - ops.group_points_added = 0; - ops.group_value_flags = RRDR_VALUE_NOTHING; - ops.group_points_non_zero = 0; - ops.group_anomaly_rate = 0; + ops->group_points_added = 0; + ops->group_value_flags = RRDR_VALUE_NOTHING; + ops->group_points_non_zero = 0; + ops->group_anomaly_rate = 0; } // the loop above increased "now" by query_granularity, // but the main loop will increase it too, // so, let's undo the last iteration of this loop if(iterations) - now_end_time -= ops.view_update_every; + now_end_time -= ops->view_update_every; } - ops.finalize(&ops.handle); + query_planer_finalize_remaining_plans(ops); r->internal.result_points_generated += points_added; - r->internal.db_points_read += 
ops.db_total_points_read; + r->internal.db_points_read += ops->db_total_points_read; for(size_t tr = 0; tr < storage_tiers ; tr++) - r->internal.tier_points_read[tr] += ops.db_points_read_per_tier[tr]; + r->internal.tier_points_read[tr] += ops->db_points_read_per_tier[tr]; r->min = min; r->max = max; r->before = max_date; - r->after = min_date - ops.view_update_every + ops.query_granularity; + r->after = min_date - ops->view_update_every + ops->query_granularity; rrdr_done(r, rrdr_line); internal_error(points_added != points_wanted, "QUERY: '%s', dimension '%s', requested %zu points, but RRDR added %zu (%zu db points read).", qt->id, string2str(qm->dimension.id), - (size_t)points_wanted, (size_t)points_added, ops.db_total_points_read); + (size_t)points_wanted, (size_t)points_added, ops->db_total_points_read); } // ---------------------------------------------------------------------------- // fill the gap of a tier void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAGE_POINT sp, usec_t now_ut); -void store_metric_collection_completed(void); -void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now) { +void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s) { if(unlikely(tier >= storage_tiers)) return; if(storage_tiers_backfill[tier] == RRD_BACKFILL_NONE) return; - struct rrddim_tier *t = rd->tiers[tier]; + struct rrddim_tier *t = &rd->tiers[tier]; if(unlikely(!t)) return; - time_t latest_time_t = t->query_ops->latest_time(t->db_metric_handle); + time_t latest_time_s = t->query_ops->latest_time_s(t->db_metric_handle); time_t granularity = (time_t)t->tier_grouping * (time_t)rd->update_every; - time_t time_diff = now - latest_time_t; + time_t time_diff = now_s - latest_time_s; // if the user wants only NEW backfilling, and we don't have any data - if(storage_tiers_backfill[tier] == RRD_BACKFILL_NEW && latest_time_t <= 0) return; + if(storage_tiers_backfill[tier] == RRD_BACKFILL_NEW && 
latest_time_s <= 0) return; // there is really nothing we can do - if(now <= latest_time_t || time_diff < granularity) return; + if(now_s <= latest_time_s || time_diff < granularity) return; struct storage_engine_query_handle handle; // for each lower tier for(int read_tier = (int)tier - 1; read_tier >= 0 ; read_tier--){ - time_t smaller_tier_first_time = rd->tiers[read_tier]->query_ops->oldest_time(rd->tiers[read_tier]->db_metric_handle); - time_t smaller_tier_last_time = rd->tiers[read_tier]->query_ops->latest_time(rd->tiers[read_tier]->db_metric_handle); - if(smaller_tier_last_time <= latest_time_t) continue; // it is as bad as we are + time_t smaller_tier_first_time = rd->tiers[read_tier].query_ops->oldest_time_s(rd->tiers[read_tier].db_metric_handle); + time_t smaller_tier_last_time = rd->tiers[read_tier].query_ops->latest_time_s(rd->tiers[read_tier].db_metric_handle); + if(smaller_tier_last_time <= latest_time_s) continue; // it is as bad as we are - long after_wanted = (latest_time_t < smaller_tier_first_time) ? smaller_tier_first_time : latest_time_t; + long after_wanted = (latest_time_s < smaller_tier_first_time) ? 
smaller_tier_first_time : latest_time_s; long before_wanted = smaller_tier_last_time; - struct rrddim_tier *tmp = rd->tiers[read_tier]; - tmp->query_ops->init(tmp->db_metric_handle, &handle, after_wanted, before_wanted); + struct rrddim_tier *tmp = &rd->tiers[read_tier]; + tmp->query_ops->init(tmp->db_metric_handle, &handle, after_wanted, before_wanted, STORAGE_PRIORITY_HIGH); size_t points_read = 0; @@ -1514,9 +1700,9 @@ void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now) STORAGE_POINT sp = tmp->query_ops->next_metric(&handle); points_read++; - if(sp.end_time > latest_time_t) { - latest_time_t = sp.end_time; - store_metric_at_tier(rd, tier, t, sp, sp.end_time * USEC_PER_SEC); + if(sp.end_time_s > latest_time_s) { + latest_time_s = sp.end_time_s; + store_metric_at_tier(rd, tier, t, sp, sp.end_time_s * USEC_PER_SEC); } } @@ -1551,12 +1737,12 @@ static void rrd2rrdr_log_request_response_metadata(RRDR *r , const char *msg ) { - time_t first_entry_t = r->internal.qt->db.first_time_t; - time_t last_entry_t = r->internal.qt->db.last_time_t; + time_t first_entry_s = r->internal.qt->db.first_time_s; + time_t last_entry_s = r->internal.qt->db.last_time_s; internal_error( - true, - "rrd2rrdr() on %s update every %ld with %s grouping %s (group: %zu, resampling_time: %ld, resampling_group: %zu), " + true, + "rrd2rrdr() on %s update every %ld with %s grouping %s (group: %zu, resampling_time: %ld, resampling_group: %zu), " "after (got: %ld, want: %ld, req: %ld, db: %ld), " "before (got: %ld, want: %ld, req: %ld, db: %ld), " "duration (got: %ld, want: %ld, req: %ld, db: %ld), " @@ -1576,19 +1762,19 @@ static void rrd2rrdr_log_request_response_metadata(RRDR *r , r->after , after_wanted , after_requested - , first_entry_t + , first_entry_s // before , r->before , before_wanted , before_requested - , last_entry_t + , last_entry_s // duration , (long)(r->before - r->after + r->internal.qt->window.query_granularity) , (long)(before_wanted - after_wanted + 
r->internal.qt->window.query_granularity) , (long)before_requested - after_requested - , (long)((last_entry_t - first_entry_t) + r->internal.qt->window.query_granularity) + , (long)((last_entry_s - first_entry_s) + r->internal.qt->window.query_granularity) // points , r->rows @@ -1708,7 +1894,7 @@ bool query_target_calculate_window(QUERY_TARGET *qt) { time_t resampling_time_requested = qt->request.resampling_time; RRDR_OPTIONS options = qt->request.options; size_t tier = qt->request.tier; - time_t update_every = qt->db.minimum_latest_update_every; + time_t update_every = qt->db.minimum_latest_update_every_s; // RULES // points_requested = 0 @@ -1763,30 +1949,30 @@ bool query_target_calculate_window(QUERY_TARGET *qt) { if (after_wanted == 0 || before_wanted == 0) { relative_period_requested = true; - time_t first_entry_t = qt->db.first_time_t; - time_t last_entry_t = qt->db.last_time_t; + time_t first_entry_s = qt->db.first_time_s; + time_t last_entry_s = qt->db.last_time_s; - if (first_entry_t == 0 || last_entry_t == 0) { - internal_error(true, "QUERY: no data detected on query '%s' (db first_entry_t = %ld, last_entry_t = %ld", qt->id, first_entry_t, last_entry_t); + if (first_entry_s == 0 || last_entry_s == 0) { + internal_error(true, "QUERY: no data detected on query '%s' (db first_entry_t = %ld, last_entry_t = %ld", qt->id, first_entry_s, last_entry_s); query_debug_log_free(); return false; } - query_debug_log(":first_entry_t %ld, last_entry_t %ld", first_entry_t, last_entry_t); + query_debug_log(":first_entry_t %ld, last_entry_t %ld", first_entry_s, last_entry_s); if (after_wanted == 0) { - after_wanted = first_entry_t; + after_wanted = first_entry_s; query_debug_log(":zero after_wanted %ld", after_wanted); } if (before_wanted == 0) { - before_wanted = last_entry_t; + before_wanted = last_entry_s; before_is_aligned_to_db_end = true; query_debug_log(":zero before_wanted %ld", before_wanted); } if (points_wanted == 0) { - points_wanted = (last_entry_t - 
first_entry_t) / update_every; + points_wanted = (last_entry_s - first_entry_s) / update_every; query_debug_log(":zero points_wanted %zu", points_wanted); } } @@ -1804,7 +1990,7 @@ bool query_target_calculate_window(QUERY_TARGET *qt) { update_every = rrdset_find_natural_update_every_for_timeframe( qt, after_wanted, before_wanted, points_wanted, options, tier); - if (update_every <= 0) update_every = qt->db.minimum_latest_update_every; + if (update_every <= 0) update_every = qt->db.minimum_latest_update_every_s; query_debug_log(":natural update every %ld", update_every); } @@ -1975,7 +2161,8 @@ RRDR *rrd2rrdr_legacy( ONEWAYALLOC *owa, RRDSET *st, size_t points, time_t after, time_t before, RRDR_GROUPING group_method, time_t resampling_time, RRDR_OPTIONS options, const char *dimensions, - const char *group_options, time_t timeout, size_t tier, QUERY_SOURCE query_source) { + const char *group_options, time_t timeout, size_t tier, QUERY_SOURCE query_source, + STORAGE_PRIORITY priority) { QUERY_TARGET_REQUEST qtr = { .st = st, @@ -1990,6 +2177,7 @@ RRDR *rrd2rrdr_legacy( .timeout = timeout, .tier = tier, .query_source = query_source, + .priority = priority, }; return rrd2rrdr(owa, query_target_create(&qtr)); @@ -2056,16 +2244,48 @@ RRDR *rrd2rrdr(ONEWAYALLOC *owa, QUERY_TARGET *qt) { if (qt->request.timeout) now_realtime_timeval(&query_start_time); + size_t last_db_points_read = 0; + size_t last_result_points_generated = 0; + + QUERY_ENGINE_OPS **ops = onewayalloc_callocz(r->internal.owa, qt->query.used, sizeof(QUERY_ENGINE_OPS *)); + + size_t capacity = libuv_worker_threads * 2; + size_t max_queries_to_prepare = (qt->query.used > (capacity - 1)) ? 
(capacity - 1) : qt->query.used; + size_t queries_prepared = 0; + while(queries_prepared < max_queries_to_prepare) { + // preload another query + ops[queries_prepared] = rrd2rrdr_query_prep(r, queries_prepared); + queries_prepared++; + } + for(size_t c = 0, max = qt->query.used; c < max ; c++) { + + if(queries_prepared < max) { + // preload another query + ops[queries_prepared] = rrd2rrdr_query_prep(r, queries_prepared); + queries_prepared++; + } + // set the query target dimension options to rrdr r->od[c] = qt->query.array[c].dimension.options; - r->od[c] |= RRDR_DIMENSION_SELECTED; - // reset the grouping for the new dimension r->internal.grouping_reset(r); - rrd2rrdr_do_dimension(r, c); + if(ops[c]) { + r->od[c] |= RRDR_DIMENSION_SELECTED; + rrd2rrdr_query_execute(r, c, ops[c]); + } + + global_statistics_rrdr_query_completed( + 1, + r->internal.db_points_read - last_db_points_read, + r->internal.result_points_generated - last_result_points_generated, + qt->request.query_source); + + last_db_points_read = r->internal.db_points_read; + last_result_points_generated = r->internal.result_points_generated; + if (qt->request.timeout) now_realtime_timeval(&query_current_time); @@ -2106,6 +2326,12 @@ RRDR *rrd2rrdr(ONEWAYALLOC *owa, QUERY_TARGET *qt) { log_access("QUERY CANCELED RUNTIME EXCEEDED %0.2f ms (LIMIT %lld ms)", (NETDATA_DOUBLE)dt_usec(&query_start_time, &query_current_time) / 1000.0, (long long)qt->request.timeout); r->result_options |= RRDR_RESULT_OPTION_CANCEL; + + for(size_t i = c + 1; i < queries_prepared ; i++) { + if(ops[i]) + query_planer_finalize_remaining_plans(ops[i]); + } + break; } } @@ -2169,7 +2395,5 @@ RRDR *rrd2rrdr(ONEWAYALLOC *owa, QUERY_TARGET *qt) { } } - global_statistics_rrdr_query_completed(dimensions_used, r->internal.db_points_read, - r->internal.result_points_generated, qt->request.query_source); return r; } diff --git a/web/api/queries/rrdr.h b/web/api/queries/rrdr.h index 6151cddc7..e31a98099 100644 --- a/web/api/queries/rrdr.h +++ 
b/web/api/queries/rrdr.h @@ -40,7 +40,8 @@ typedef enum rrdr_options { RRDR_OPTION_RETURN_RAW = 0x00100000, // Return raw data for aggregating across multiple nodes RRDR_OPTION_RETURN_JWAR = 0x00200000, // Return anomaly rates in jsonwrap RRDR_OPTION_SELECTED_TIER = 0x00400000, // Use the selected tier for the query - RRDR_OPTION_ALL_DIMENSIONS = 0x00800000, // Return the full dimensions list + RRDR_OPTION_ALL_DIMENSIONS = 0x00800000, // Return the full dimensions list + RRDR_OPTION_SHOW_PLAN = 0x01000000, // Return the query plan in jsonwrap // internal ones - not to be exposed to the API RRDR_OPTION_INTERNAL_AR = 0x10000000, // internal use only, to let the formatters we want to render the anomaly rate @@ -138,7 +139,8 @@ RRDR *rrd2rrdr_legacy( ONEWAYALLOC *owa, RRDSET *st, size_t points, time_t after, time_t before, RRDR_GROUPING group_method, time_t resampling_time, RRDR_OPTIONS options, const char *dimensions, - const char *group_options, time_t timeout, size_t tier, QUERY_SOURCE query_source); + const char *group_options, time_t timeout, size_t tier, QUERY_SOURCE query_source, + STORAGE_PRIORITY priority); RRDR *rrd2rrdr(ONEWAYALLOC *owa, struct query_target *qt); bool query_target_calculate_window(struct query_target *qt); diff --git a/web/api/queries/weights.c b/web/api/queries/weights.c index a9555a66b..dc98aeedf 100644 --- a/web/api/queries/weights.c +++ b/web/api/queries/weights.c @@ -520,6 +520,7 @@ NETDATA_DOUBLE *rrd2rrdr_ks2( .group_options = group_options, .tier = tier, .query_source = QUERY_SOURCE_API_WEIGHTS, + .priority = STORAGE_PRIORITY_NORMAL, }; RRDR *r = rrd2rrdr(owa, query_target_create(&qtr)); @@ -638,7 +639,9 @@ static void rrdset_metric_correlations_volume( options |= RRDR_OPTION_MATCH_IDS | RRDR_OPTION_ABSOLUTE | RRDR_OPTION_NATURAL_POINTS; - QUERY_VALUE baseline_average = rrdmetric2value(host, rca, ria, rma, baseline_after, baseline_before, options, group_method, group_options, tier, 0, QUERY_SOURCE_API_WEIGHTS); + QUERY_VALUE 
baseline_average = rrdmetric2value(host, rca, ria, rma, baseline_after, baseline_before, + options, group_method, group_options, tier, 0, + QUERY_SOURCE_API_WEIGHTS, STORAGE_PRIORITY_NORMAL); merge_query_value_to_stats(&baseline_average, stats); if(!netdata_double_isnumber(baseline_average.value)) { @@ -646,7 +649,9 @@ static void rrdset_metric_correlations_volume( baseline_average.value = 0.0; } - QUERY_VALUE highlight_average = rrdmetric2value(host, rca, ria, rma, after, before, options, group_method, group_options, tier, 0, QUERY_SOURCE_API_WEIGHTS); + QUERY_VALUE highlight_average = rrdmetric2value(host, rca, ria, rma, after, before, + options, group_method, group_options, tier, 0, + QUERY_SOURCE_API_WEIGHTS, STORAGE_PRIORITY_NORMAL); merge_query_value_to_stats(&highlight_average, stats); if(!netdata_double_isnumber(highlight_average.value)) @@ -659,7 +664,9 @@ static void rrdset_metric_correlations_volume( char highlight_countif_options[50 + 1]; snprintfz(highlight_countif_options, 50, "%s" NETDATA_DOUBLE_FORMAT, highlight_average.value < baseline_average.value ? 
"<" : ">", baseline_average.value); - QUERY_VALUE highlight_countif = rrdmetric2value(host, rca, ria, rma, after, before, options, RRDR_GROUPING_COUNTIF, highlight_countif_options, tier, 0, QUERY_SOURCE_API_WEIGHTS); + QUERY_VALUE highlight_countif = rrdmetric2value(host, rca, ria, rma, after, before, + options, RRDR_GROUPING_COUNTIF, highlight_countif_options, tier, 0, + QUERY_SOURCE_API_WEIGHTS, STORAGE_PRIORITY_NORMAL); merge_query_value_to_stats(&highlight_countif, stats); if(!netdata_double_isnumber(highlight_countif.value)) { @@ -700,7 +707,10 @@ static void rrdset_weights_anomaly_rate( options |= RRDR_OPTION_MATCH_IDS | RRDR_OPTION_ANOMALY_BIT | RRDR_OPTION_NATURAL_POINTS; - QUERY_VALUE qv = rrdmetric2value(host, rca, ria, rma, after, before, options, group_method, group_options, tier, 0, QUERY_SOURCE_API_WEIGHTS); + QUERY_VALUE qv = rrdmetric2value(host, rca, ria, rma, after, before, + options, group_method, group_options, tier, 0, + QUERY_SOURCE_API_WEIGHTS, STORAGE_PRIORITY_NORMAL); + merge_query_value_to_stats(&qv, stats); if(netdata_double_isnumber(qv.value)) diff --git a/web/api/web_api_v1.c b/web/api/web_api_v1.c index 93f501f9e..1b38a33b1 100644 --- a/web/api/web_api_v1.c +++ b/web/api/web_api_v1.c @@ -41,6 +41,7 @@ static struct { , {"natural-points" , 0 , RRDR_OPTION_NATURAL_POINTS} , {"virtual-points" , 0 , RRDR_OPTION_VIRTUAL_POINTS} , {"all-dimensions" , 0 , RRDR_OPTION_ALL_DIMENSIONS} + , {"plan" , 0 , RRDR_OPTION_SHOW_PLAN} , {NULL , 0 , 0} }; @@ -311,7 +312,7 @@ inline int web_client_api_request_v1_alarm_count(RRDHOST *host, struct web_clien else if (!strcmp("CLEAR", value)) status = RRDCALC_STATUS_CLEAR; } else if(!strcmp(name, "context") || !strcmp(name, "ctx")) { - if(!contexts) contexts = buffer_create(255); + if(!contexts) contexts = buffer_create(255, &netdata_buffers_statistics.buffers_api); buffer_strcat(contexts, "|"); buffer_strcat(contexts, value); } @@ -388,7 +389,7 @@ inline int web_client_api_request_single_chart(RRDHOST *host, 
struct web_client } w->response.data->contenttype = CT_APPLICATION_JSON; - st->last_accessed_time = now_realtime_sec(); + st->last_accessed_time_s = now_realtime_sec(); callback(st, w->response.data); return HTTP_RESP_OK; @@ -459,7 +460,7 @@ static int web_client_api_request_v1_context(RRDHOST *host, struct web_client *w else if(!strcmp(name, "chart_label_key")) chart_label_key = value; else if(!strcmp(name, "chart_labels_filter")) chart_labels_filter = value; else if(!strcmp(name, "dimension") || !strcmp(name, "dim") || !strcmp(name, "dimensions") || !strcmp(name, "dims")) { - if(!dimensions) dimensions = buffer_create(100); + if(!dimensions) dimensions = buffer_create(100, &netdata_buffers_statistics.buffers_api); buffer_strcat(dimensions, "|"); buffer_strcat(dimensions, value); } @@ -520,7 +521,7 @@ static int web_client_api_request_v1_contexts(RRDHOST *host, struct web_client * else if(!strcmp(name, "chart_label_key")) chart_label_key = value; else if(!strcmp(name, "chart_labels_filter")) chart_labels_filter = value; else if(!strcmp(name, "dimension") || !strcmp(name, "dim") || !strcmp(name, "dimensions") || !strcmp(name, "dims")) { - if(!dimensions) dimensions = buffer_create(100); + if(!dimensions) dimensions = buffer_create(100, &netdata_buffers_statistics.buffers_api); buffer_strcat(dimensions, "|"); buffer_strcat(dimensions, value); } @@ -560,18 +561,6 @@ inline int web_client_api_request_v1_charts(RRDHOST *host, struct web_client *w, return HTTP_RESP_OK; } -inline int web_client_api_request_v1_archivedcharts(RRDHOST *host __maybe_unused, struct web_client *w, char *url) { - (void)url; - - buffer_flush(w->response.data); - w->response.data->contenttype = CT_APPLICATION_JSON; -#ifdef ENABLE_DBENGINE - if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - sql_rrdset2json(host, w->response.data); -#endif - return HTTP_RESP_OK; -} - inline int web_client_api_request_v1_chart(RRDHOST *host, struct web_client *w, char *url) { return 
web_client_api_request_single_chart(host, w, url, rrd_stats_api_v1_chart); } @@ -637,7 +626,7 @@ inline int web_client_api_request_v1_data(RRDHOST *host, struct web_client *w, c else if(!strcmp(name, "chart_labels_filter")) chart_labels_filter = value; else if(!strcmp(name, "chart")) chart = value; else if(!strcmp(name, "dimension") || !strcmp(name, "dim") || !strcmp(name, "dimensions") || !strcmp(name, "dims")) { - if(!dimensions) dimensions = buffer_create(100); + if(!dimensions) dimensions = buffer_create(100, &netdata_buffers_statistics.buffers_api); buffer_strcat(dimensions, "|"); buffer_strcat(dimensions, value); } @@ -752,6 +741,7 @@ inline int web_client_api_request_v1_data(RRDHOST *host, struct web_client *w, c .chart_label_key = chart_label_key, .charts_labels_filter = chart_labels_filter, .query_source = QUERY_SOURCE_API_DATA, + .priority = STORAGE_PRIORITY_NORMAL, }; qt = query_target_create(&qtr); @@ -1068,15 +1058,13 @@ static inline void web_client_api_request_v1_info_mirrored_hosts(BUFFER *wb) { if (count > 0) buffer_strcat(wb, ",\n"); - netdata_mutex_lock(&host->receiver_lock); buffer_sprintf( wb, "\t\t{ \"guid\": \"%s\", \"hostname\": \"%s\", \"reachable\": %s, \"hops\": %d" , host->machine_guid , rrdhost_hostname(host) - , (host->receiver || host == localhost) ? "true" : "false" + , (host == localhost || !rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN)) ? "true" : "false" , host->system_info ? host->system_info->hops : (host == localhost) ? 
0 : 1 ); - netdata_mutex_unlock(&host->receiver_lock); rrdhost_aclk_state_lock(host); if (host->aclk_state.claimed_id) @@ -1510,7 +1498,7 @@ int web_client_api_request_v1_functions(RRDHOST *host, struct web_client *w, cha } #ifndef ENABLE_DBENGINE -int web_client_api_request_v1_dbengine_stats(RRDHOST *host, struct web_client *w, char *url) { +int web_client_api_request_v1_dbengine_stats(RRDHOST *host __maybe_unused, struct web_client *w __maybe_unused, char *url __maybe_unused) { return HTTP_RESP_NOT_FOUND; } #else @@ -1519,12 +1507,6 @@ static void web_client_api_v1_dbengine_stats_for_tier(BUFFER *wb, size_t tier) { buffer_sprintf(wb, "\n\t\t\"default_granularity_secs\":%zu" - ",\n\t\t\"sizeof_metric\":%zu" - ",\n\t\t\"sizeof_metric_in_index\":%zu" - ",\n\t\t\"sizeof_page\":%zu" - ",\n\t\t\"sizeof_page_in_index\":%zu" - ",\n\t\t\"sizeof_extent\":%zu" - ",\n\t\t\"sizeof_page_in_extent\":%zu" ",\n\t\t\"sizeof_datafile\":%zu" ",\n\t\t\"sizeof_page_in_cache\":%zu" ",\n\t\t\"sizeof_point_data\":%zu" @@ -1540,8 +1522,8 @@ static void web_client_api_v1_dbengine_stats_for_tier(BUFFER *wb, size_t tier) { ",\n\t\t\"pages_uncompressed_bytes\":%zu" ",\n\t\t\"pages_duration_secs\":%lld" ",\n\t\t\"single_point_pages\":%zu" - ",\n\t\t\"first_t\":%llu" - ",\n\t\t\"last_t\":%llu" + ",\n\t\t\"first_t\":%ld" + ",\n\t\t\"last_t\":%ld" ",\n\t\t\"database_retention_secs\":%lld" ",\n\t\t\"average_compression_savings\":%0.2f" ",\n\t\t\"average_point_duration_secs\":%0.2f" @@ -1550,16 +1532,9 @@ static void web_client_api_v1_dbengine_stats_for_tier(BUFFER *wb, size_t tier) { ",\n\t\t\"average_page_size_bytes\":%0.2f" ",\n\t\t\"estimated_concurrently_collected_metrics\":%zu" ",\n\t\t\"currently_collected_metrics\":%zu" - ",\n\t\t\"max_concurrently_collected_metrics\":%zu" ",\n\t\t\"disk_space\":%zu" ",\n\t\t\"max_disk_space\":%zu" , stats.default_granularity_secs - , stats.sizeof_metric - , stats.sizeof_metric_in_index - , stats.sizeof_page - , stats.sizeof_page_in_index - , 
stats.sizeof_extent - , stats.sizeof_page_in_extent , stats.sizeof_datafile , stats.sizeof_page_in_cache , stats.sizeof_point_data @@ -1575,8 +1550,8 @@ static void web_client_api_v1_dbengine_stats_for_tier(BUFFER *wb, size_t tier) { , stats.pages_uncompressed_bytes , (long long)stats.pages_duration_secs , stats.single_point_pages - , stats.first_t - , stats.last_t + , stats.first_time_s + , stats.last_time_s , (long long)stats.database_retention_secs , stats.average_compression_savings , stats.average_point_duration_secs @@ -1585,7 +1560,6 @@ static void web_client_api_v1_dbengine_stats_for_tier(BUFFER *wb, size_t tier) { , stats.average_page_size_bytes , stats.estimated_concurrently_collected_metrics , stats.currently_collected_metrics - , stats.max_concurrently_collected_metrics , stats.disk_space , stats.max_disk_space ); @@ -1634,7 +1608,6 @@ static struct api_command { { "charts", 0, WEB_CLIENT_ACL_DASHBOARD | WEB_CLIENT_ACL_ACLK, web_client_api_request_v1_charts }, { "context", 0, WEB_CLIENT_ACL_DASHBOARD | WEB_CLIENT_ACL_ACLK, web_client_api_request_v1_context }, { "contexts", 0, WEB_CLIENT_ACL_DASHBOARD | WEB_CLIENT_ACL_ACLK, web_client_api_request_v1_contexts }, - { "archivedcharts", 0, WEB_CLIENT_ACL_DASHBOARD | WEB_CLIENT_ACL_ACLK, web_client_api_request_v1_archivedcharts }, // registry checks the ACL by itself, so we allow everything { "registry", 0, WEB_CLIENT_ACL_NOCHECK, web_client_api_request_v1_registry }, diff --git a/web/api/web_api_v1.h b/web/api/web_api_v1.h index e6682c99c..9dd6a1c23 100644 --- a/web/api/web_api_v1.h +++ b/web/api/web_api_v1.h @@ -9,7 +9,6 @@ #include "web/api/health/health_cmdapi.h" #include "web/api/queries/weights.h" -#define MAX_CHART_LABELS_FILTER (32) RRDR_OPTIONS web_client_api_request_v1_data_options(char *o); void web_client_api_request_v1_data_options_to_buffer(BUFFER *wb, RRDR_OPTIONS options); void web_client_api_request_v1_data_options_to_string(char *buf, size_t size, RRDR_OPTIONS options); @@ -24,7 +23,6 @@ int 
web_client_api_request_single_chart(RRDHOST *host, struct web_client *w, cha int web_client_api_request_v1_alarm_variables(RRDHOST *host, struct web_client *w, char *url); int web_client_api_request_v1_alarm_count(RRDHOST *host, struct web_client *w, char *url); int web_client_api_request_v1_charts(RRDHOST *host, struct web_client *w, char *url); -int web_client_api_request_v1_archivedcharts(RRDHOST *host, struct web_client *w, char *url); int web_client_api_request_v1_chart(RRDHOST *host, struct web_client *w, char *url); int web_client_api_request_v1_data(RRDHOST *host, struct web_client *w, char *url); int web_client_api_request_v1_registry(RRDHOST *host, struct web_client *w, char *url); diff --git a/web/gui/README.md b/web/gui/README.md index 69db6becb..fbd7da4df 100644 --- a/web/gui/README.md +++ b/web/gui/README.md @@ -13,16 +13,16 @@ before: action](https://user-images.githubusercontent.com/1153921/101513938-fae28380-3939-11eb-9434-8ad86a39be62.gif) Learn more about how dashboards work and how they're populated using the `dashboards.js` file in our [web dashboards -overview](/web/README.md). +overview](https://github.com/netdata/netdata/blob/master/web/README.md). By default, Netdata starts a web server for its dashboard at port `19999`. Open up your web browser of choice and navigate to `http://NODE:19999`, replacing `NODE` with the IP address or hostname of your Agent. If you're unsure, try `http://localhost:19999` first. -Netdata uses an [internal, static-threaded web server](/web/server/README.md) to host the HTML, CSS, and JavaScript +Netdata uses an [internal, static-threaded web server](https://github.com/netdata/netdata/blob/master/web/server/README.md) to host the HTML, CSS, and JavaScript files that make up the local Agent dashboard. 
You don't have to configure anything to access it, although you can adjust -[your settings](/web/server/README.md#other-netdataconf-web-section-options) in the `netdata.conf` file, or run Netdata -behind an [Nginx proxy](https://learn.netdata.cloud/docs/agent/running-behind-nginx), and so on. +[your settings](https://github.com/netdata/netdata/blob/master/web/server/README.md#other-netdataconf-web-section-options) in the `netdata.conf` file, or run Netdata +behind an [Nginx proxy](https://github.com/netdata/netdata/blob/master/docs/Running-behind-nginx.md), and so on. ## Navigating the local dashboard @@ -40,8 +40,8 @@ dashboard](https://user-images.githubusercontent.com/1153921/101509403-f7e59400- Netdata is broken up into multiple **sections**, such as **System Overview**, **CPU**, **Disk**, and more. Inside each section you'll find a number of charts, -broken down into [contexts](/web/README.md#contexts) and -[families](/web/README.md#families). +broken down into [contexts](https://github.com/netdata/netdata/blob/master/web/README.md#contexts) and +[families](https://github.com/netdata/netdata/blob/master/web/README.md#families). An example of the **Memory** section on a Linux desktop system. @@ -69,7 +69,7 @@ Use the calendar to select multiple days. Click on a date to begin the timeframe Click **Apply** to re-render all visualizations with new metrics data, or **Clear** to restore the default timeframe. -[Increase the metrics retention policy](/docs/store/change-metrics-storage.md) for your node to see more historical +[Increase the metrics retention policy](https://github.com/netdata/netdata/blob/master/docs/store/change-metrics-storage.md) for your node to see more historical timeframes. ### Metrics menus @@ -80,7 +80,7 @@ section, and menus link to the section they're associated with. 
![A screenshot of metrics menus](https://user-images.githubusercontent.com/1153921/80834638-f08f2880-8ba5-11ea-99ae-f610b2885fd6.png) Most metrics menu items will contain several **submenu** entries, which represent any -[families](/web/README.md#families) from that section. Netdata automatically +[families](https://github.com/netdata/netdata/blob/master/web/README.md#families) from that section. Netdata automatically generates these submenu entries. Here's a **Disks** menu with several submenu entries for each disk drive and @@ -100,7 +100,7 @@ a War Room's name to jump to the Netdata Cloud web interface. menus](https://user-images.githubusercontent.com/1153921/80837210-3f8b8c80-8bab-11ea-9c75-128c2d823ef8.png) If you want to know more about how Cloud populates this menu, and the Agent-Cloud integration at a high level, see our -document on [using the Agent with Netdata Cloud](/docs/agent-cloud.md). +document on [using the Agent with Netdata Cloud](https://github.com/netdata/netdata/blob/master/docs/agent-cloud.md). ## Customizing the local dashboard @@ -163,5 +163,5 @@ file](https://user-images.githubusercontent.com/1153921/62798924-570e6c80-ba94-1 ## Custom dashboards -For information on creating custom dashboards from scratch, see the [custom dashboards](/web/gui/custom/README.md) or -[Atlassian Confluence dashboards](/web/gui/confluence/README.md) guides. +For information on creating custom dashboards from scratch, see the [custom dashboards](https://github.com/netdata/netdata/blob/master/web/gui/custom/README.md) or +[Atlassian Confluence dashboards](https://github.com/netdata/netdata/blob/master/web/gui/confluence/README.md) guides. diff --git a/web/gui/confluence/README.md b/web/gui/confluence/README.md index 64dacdf38..9e7b8025f 100644 --- a/web/gui/confluence/README.md +++ b/web/gui/confluence/README.md @@ -85,7 +85,7 @@ This badge is now auto-refreshing. 
It will update itself based on the update fre > Keep in mind you can add badges with custom Netdata queries too. Netdata automatically creates badges for all the > alarms, but every chart, every dimension on every chart, can be used for a badge. And Netdata badges are quite -> powerful! Check [Creating Badges](/web/api/badges/README.md) for more information on badges. +> powerful! Check [Creating Badges](https://github.com/netdata/netdata/blob/master/web/api/badges/README.md) for more information on badges. So, let's create a table and add this badge for both our web servers: diff --git a/web/gui/custom/README.md b/web/gui/custom/README.md index cdd5d4260..0751f2087 100644 --- a/web/gui/custom/README.md +++ b/web/gui/custom/README.md @@ -1,7 +1,11 @@ # Custom dashboards @@ -28,7 +32,7 @@ monitoring two servers on the same page: ![image](https://cloud.githubusercontent.com/assets/2662304/14252187/d8d5f78e-fa8e-11e5-990d-99821d38c874.png) --- + ## Web directory @@ -72,7 +76,6 @@ header: ``` ---- ## dashboard.js @@ -163,7 +166,7 @@ that do not specify a Netdata server, add this before loading `dashboard.js`: ``` ---- + ## Adding charts @@ -242,7 +245,7 @@ Each chart can get data from a different Netdata server. You can specify the Net > ``` -If you have ephemeral monitoring setup ([More info here](/streaming/README.md#monitoring-ephemeral-nodes)) and have no +If you have ephemeral monitoring setup ([More info here](https://github.com/netdata/netdata/blob/master/streaming/README.md#monitoring-ephemeral-nodes)) and have no direct access to the nodes dashboards, you can use the following: ```html @@ -366,7 +369,7 @@ select specific dimensions using this: ``` Netdata supports coma (`,`) or pipe (`|`) separated [simple -patterns](/libnetdata/simple_pattern/README.md) for dimensions. By default it +patterns](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md) for dimensions. By default it searches for both dimension IDs and dimension NAMEs. 
You can control the target of the match with: `data-append-options="match-ids"` or `data-append-options="match-names"`. Spaces in `data-dimensions=""` are matched @@ -434,7 +437,7 @@ it, using this: ### API options -You can append Netdata **[REST API v1](/web/api/README.md)** data options, using this: +You can append Netdata **[REST API v1](https://github.com/netdata/netdata/blob/master/web/api/README.md)** data options, using this: ```html
', - info: 'Network latency statistics, via fping. fping is a program to send ICMP echo probes to network hosts, similar to ping, but much better performing when pinging multiple hosts. fping versions after 3.15 can be directly used as netdata plugins.' - }, - 'ping': { title: 'Ping', icon: '', @@ -585,6 +579,36 @@ netdataDashboard.menu = { info: undefined }, + 'iis': { + title: 'IIS', + icon: '', + info: undefined + }, + + 'mssql': { + title: 'MS SQL Server', + icon: '', + info: undefined + }, + + 'ad': { + title: 'Active Directory', + icon: '', + info: undefined + }, + + 'adcs': { + title: 'AD Certification Service', + icon: '', + info: undefined + }, + + 'adfs': { + title: 'AD Federation Service', + icon: '', + info: undefined + }, + 'perf': { title: 'Perf Counters', icon: '', @@ -738,6 +762,12 @@ netdataDashboard.menu = { title: 'Cassandra', icon: '', info: 'Performance metrics for Cassandra, the open source distributed NoSQL database management system' + }, + + 'consul': { + title: 'Consul', + icon: '', + info: 'Consul performance and health metrics. For details, see Key Metrics.' } }; @@ -4224,6 +4254,114 @@ netdataDashboard.context = { info: 'Requests for which a storage exception was encountered.' }, + // ------------------------------------------------------------------------ + // Consul + 'consul.node_health_check_status': { + info: 'The current status of the node health check. A node health check monitors the health of the entire node. If the node health check fails, Consul marks the node as unhealthy.' + }, + 'consul.service_health_check_status': { + info: 'The current status of the service health check. A service check only affects the health of the service it is associated with. If the service health check fails, the DNS interface stops returning that service.' + }, + 'consul.client_rpc_requests_rate': { + info: 'The number of RPC requests to a Consul server.' 
+ }, + 'consul.client_rpc_requests_exceeded_rate': { + info: 'The number of rate-limited RPC requests to a Consul server. An increase in this metric indicates either that the load is getting high enough to limit the rate or that a Consul agent is incorrectly configured.' + }, + 'consul.client_rpc_requests_failed_rate': { + info: 'The number of failed RPC requests to a Consul server.' + }, + 'consul.memory_allocated': { + info: 'The amount of memory allocated by the Consul process.' + }, + 'consul.memory_sys': { + info: 'The amount of memory obtained from the OS.' + }, + 'consul.gc_pause_time': { + info: 'The amount of time spent in garbage collection (GC) pauses. GC pause is a "stop-the-world" event, meaning that all runtime threads are blocked until GC completes. If memory usage is high, the Go runtime may GC so frequently that it starts to slow down Consul.' + }, + 'consul.kvs_apply_time': { + info: 'The time it takes to complete an update to the KV store.' + }, + 'consul.kvs_apply_operations_rate': { + info: 'The number of KV store updates.' + }, + 'consul.txn_apply_time': { + info: 'The time spent applying a transaction operation.' + }, + 'consul.txn_apply_operations_rate': { + info: 'The number of applied transaction operations.' + }, + 'consul.raft_commit_time': { + info: 'The time it takes to commit a new entry to the Raft log on the leader.' + }, + 'consul.raft_commits_rate': { + info: 'The number of applied Raft transactions.' + }, + 'consul.autopilot_health_status': { + info: 'The overall health of the local server cluster. The status is healthy if all servers are considered healthy by Autopilot.' + }, + 'consul.autopilot_server_health_status': { + info: 'Whether the server is healthy according to the current Autopilot configuration.' + }, + 'consul.autopilot_server_stable_time': { + info: 'The time this server has been in its current state.' + }, + 'consul.autopilot_server_serf_status': { + info: 'The SerfHealth check status for the server.' 
+ }, + 'consul.autopilot_server_voter_status': { + info: 'Whether the server is a voting member of the Raft cluster.' + }, + 'consul.autopilot_failure_tolerance': { + info: 'The number of voting servers that the cluster can lose while continuing to function.' + }, + 'consul.network_lan_rtt': { + info: 'Estimated network round-trip time between this node and other nodes of the cluster.' + }, + 'consul.raft_leader_last_contact_time': { + info: 'The time since the leader was last able to contact the follower nodes when checking its leader lease.' + }, + 'consul.raft_follower_last_contact_leader_time': { + info: 'The time elapsed since this server last contacted the leader.' + }, + 'consul.raft_leader_elections_rate': { + info: 'The number of leadership elections. Increments whenever a Consul server starts an election.' + }, + 'consul.raft_leadership_transitions_rate': { + info: 'The number of leadership transitions. Increments whenever a Consul server becomes a leader.' + }, + 'consul.server_leadership_status': { + info: 'The Consul server leadership status.' + }, + 'consul.raft_thread_main_saturation_perc': { + info: 'An approximate measurement of the proportion of time the main Raft goroutine is busy and unavailable to accept new work.' + }, + 'consul.raft_thread_fsm_saturation_perc': { + info: 'An approximate measurement of the proportion of time the Raft FSM goroutine is busy and unavailable to accept new work.' + }, + 'consul.raft_fsm_last_restore_duration': { + info: 'The time taken to restore the FSM from a snapshot on an agent restart or from the leader calling installSnapshot.' + }, + 'consul.raft_leader_oldest_log_age': { + info: 'The time elapsed since the oldest journal was written to the leader\'s journal storage. This can be important for the health of replication when the write rate is high and the snapshot is large, because followers may not be able to recover from a restart if recovery takes longer than the minimum for the current leader.' 
+ }, + 'consul.raft_rpc_install_snapshot_time': { + info: 'The time it takes to process the installSnapshot RPC call.' + }, + 'consul.raft_boltdb_freelist_bytes': { + info: 'The number of bytes necessary to encode the freelist metadata. When raft_boltdb.NoFreelistSync is set to false these metadata bytes must also be written to disk for each committed log.' + }, + 'consul.raft_boltdb_logs_per_batch_rate': { + info: 'The number of logs written per batch to the database.' + }, + 'consul.raft_boltdb_store_logs_time': { + info: 'The amount of time spent writing logs to the database.' + }, + 'consul.license_expiration_time': { + info: 'The amount of time remaining before Consul Enterprise license expires. When the license expires, some Consul Enterprise features will stop working.' + }, + // ------------------------------------------------------------------------ // WMI (Process) @@ -4280,6 +4418,93 @@ netdataDashboard.context = { info: 'Rate at which segments are sent, including those on current connections, but excluding those containing only retransmitted bytes.' }, + // ------------------------------------------------------------------------ + // WMI (IIS) + + 'iis.website_isapi_extension_requests_count': { + info: 'The number of ISAPI extension requests that are processed concurrently by the web service.' + }, + 'iis.website_errors_rate': { + info: '

The number of requests that cannot be satisfied by the server.

DocumentLocked - the requested document was locked. Usually reported as HTTP error 423. DocumentNotFound - the requested document was not found. Usually reported as HTTP error 404.

' + }, + + // ------------------------------------------------------------------------ + // WMI (Service) + + 'wmi.service_status': { + info: 'The current status of the service.' + }, + + // ------------------------------------------------------------------------ + // WMI (MSSQL) + + 'mssql.instance_accessmethods_page_splits': { + info : 'Page split happens when the page does not have more space. This chart shows the number of page splits per second that occur as the result of overflowing index pages.' + }, + + 'mssql.instance_cache_hit_ratio': { + info : 'Indicates the percentage of pages found in the buffer cache without having to read from disk. The ratio is the total number of cache hits divided by the total number of cache lookups over the last few thousand page accesses. After a long period of time, the ratio moves very little. Because reading from the cache is much less expensive than reading from disk, you want this ratio to be high.' + }, + + 'mssql.instance_bufman_checkpoint_pages': { + info : 'Indicates the number of pages flushed to disk per second by a checkpoint or other operation that require all dirty pages to be flushed.' + }, + + 'mssql.instance_bufman_page_life_expectancy': { + info : 'Indicates the number of seconds a page will stay in the buffer pool without references.' + }, + + 'mssql.instance_memmgr_external_benefit_of_memory': { + info : 'It is used by the engine to balance memory usage between cache and is useful to support when troubleshooting cases with unexpected cache growth. The value is presented as an integer based on an internal calculation.' + }, + + 'mssql.instance_sql_errors': { + info: 'Errors in Microsoft SQL Server.

Db_offline - Tracks severe errors that cause SQL Server to take the current database offline. Info - Information related to error messages that provide information to users but do not cause errors. Kill_connection - Tracks severe errors that cause SQL Server to kill the current connection. User - User errors.

' + }, + + 'mssql.instance_sqlstats_auto_parameterization_attempts': { + info: 'Auto-parameterization occurs when an instance of SQL Server tries to parameterize a Transact-SQL request by replacing some literals with parameters so that reuse of the resulting cached execution plan across multiple similar-looking requests is possible. Note that auto-parameterizations are also known as simple parameterizations in newer versions of SQL Server. This counter does not include forced parameterizations.' + }, + + 'mssql.instance_sqlstats_batch_requests': { + info: 'This statistic is affected by all constraints (such as I/O, number of users, cache size, complexity of requests, and so on). High batch requests mean good throughput.' + }, + + 'mssql.instance_sqlstats_safe_auto_parameterization_attempts': { + info: 'Note that auto-parameterizations are also known as simple parameterizations in later versions of SQL Server.' + }, + + 'mssql.instance_sqlstats_sql_compilations': { + info: 'Indicates the number of times the compile code path is entered. Includes compiles caused by statement-level recompilations in SQL Server. After SQL Server user activity is stable, this value reaches a steady state.' + }, + + // ------------------------------------------------------------------------ + // WMI (AD) + + 'ad.dra_replication_intersite_compressed_traffic': { + info: 'The compressed size, in bytes, of inbound and outbound compressed replication data (size after compression, from DSAs in other sites).' + }, + + 'ad.dra_replication_intrasite_compressed_traffic': { + info: 'The number of bytes replicated that were not compressed (that is, from DSAs in the same site).' + }, + + 'ad.dra_replication_properties_updated': { + info: 'The number of properties that are updated due to incoming property winning the reconciliation logic that determines the final value to be replicated.' 
+ }, + + 'ad.dra_replication_objects_filtered': { + info: 'The number of objects received from inbound replication partners that contained no updates that needed to be applied.' + }, + + 'ad.dra_replication_pending_syncs': { + info: 'The number of directory synchronizations that are queued for this server but not yet processed.' + }, + + 'ad.dra_replication_sync_requests': { + info: 'The number of directory synchronization requests made to replication partners.' + }, + // ------------------------------------------------------------------------ // APACHE @@ -4418,6 +4643,150 @@ netdataDashboard.context = { ] }, + // ------------------------------------------------------------------------ + // NGINX Plus + 'nginxplus.client_connections_rate': { + info: 'Accepted and dropped (not handled) connections. A connection is considered dropped if the worker process is unable to get a connection for the request by establishing a new connection or reusing an open one.' + }, + 'nginxplus.client_connections_count': { + info: 'The current number of client connections. A connection is considered idle if there are currently no active requests.' + }, + 'nginxplus.ssl_handshakes_rate': { + info: 'Successful and failed SSL handshakes.' + }, + 'nginxplus.ssl_session_reuses_rate': { + info: 'The number of session reuses during SSL handshake.' + }, + 'nginxplus.ssl_handshakes_failures_rate': { + info: '

SSL handshake failures.

NoCommonProtocol - failed because of no common protocol. NoCommonCipher - failed because of no shared cipher. Timeout - failed because of a timeout. PeerRejectedCert - failed because a client rejected the certificate.

' + }, + 'nginxplus.ssl_verification_errors_rate': { + info: '

SSL verification errors.

NoCert - a client did not provide the required certificate. ExpiredCert - an expired or not yet valid certificate was presented by a client. RevokedCert - a revoked certificate was presented by a client. HostnameMismatch - server\'s certificate does not match the hostname. Other - other SSL certificate verification errors.

' + }, + 'nginxplus.http_requests_rate': { + info: 'The number of HTTP requests received from clients.' + }, + 'nginxplus.http_requests_count': { + info: 'The current number of client requests.' + }, + 'nginxplus.uptime': { + info: 'The time elapsed since the NGINX process was started.' + }, + 'nginxplus.http_server_zone_requests_rate': { + info: 'The number of requests to the HTTP Server Zone.' + }, + 'nginxplus.http_server_zone_responses_per_code_class_rate': { + info: 'The number of responses from the HTTP Server Zone. Responses grouped by HTTP status code class.' + }, + 'nginxplus.http_server_zone_traffic_rate': { + info: 'The amount of data transferred to and from the HTTP Server Zone.' + }, + 'nginxplus.http_server_zone_requests_processing_count': { + info: 'The number of client requests that are currently being processed by the HTTP Server Zone.' + }, + 'nginxplus.http_server_zone_requests_discarded_rate': { + info: 'The number of requests to the HTTP Server Zone completed without sending a response.' + }, + 'nginxplus.http_location_zone_requests_rate': { + info: 'The number of requests to the HTTP Location Zone.' + }, + 'nginxplus.http_location_zone_responses_per_code_class_rate': { + info: 'The number of responses from the HTTP Location Zone. Responses grouped by HTTP status code class.' + }, + 'nginxplus.http_location_zone_traffic_rate': { + info: 'The amount of data transferred to and from the HTTP Location Zone.' + }, + 'nginxplus.http_location_zone_requests_discarded_rate': { + info: 'The number of requests to the HTTP Location Zone completed without sending a response.' + }, + 'nginxplus.http_upstream_peers_count': { + info: 'The number of HTTP Upstream servers.' + }, + 'nginxplus.http_upstream_zombies_count': { + info: 'The current number of HTTP Upstream servers removed from the group but still processing active client requests.' 
+ }, + 'nginxplus.http_upstream_keepalive_count': { + info: 'The current number of idle keepalive connections to the HTTP Upstream.' + }, + 'nginxplus.http_upstream_server_requests_rate': { + info: 'The number of client requests forwarded to the HTTP Upstream Server.' + }, + 'nginxplus.http_upstream_server_responses_per_code_class_rate': { + info: 'The number of responses received from the HTTP Upstream Server. Responses grouped by HTTP status code class.' + }, + 'nginxplus.http_upstream_server_response_time': { + info: 'The average time to get a complete response from the HTTP Upstream Server.' + }, + 'nginxplus.http_upstream_server_response_header_time': { + info: 'The average time to get a response header from the HTTP Upstream Server.' + }, + 'nginxplus.http_upstream_server_traffic_rate': { + info: 'The amount of traffic transferred to and from the HTTP Upstream Server.' + }, + 'nginxplus.http_upstream_server_state': { + info: 'The current state of the HTTP Upstream Server. Status active if set to 1.' + }, + 'nginxplus.http_upstream_server_connections_count': { + info: 'The current number of active connections to the HTTP Upstream Server.' + }, + 'nginxplus.http_upstream_server_downtime': { + info: 'The time the HTTP Upstream Server has spent in the unavail, checking, and unhealthy states.' + }, + 'nginxplus.http_cache_state': { + info: 'HTTP cache current state. Cold means that the cache loader process is still loading data from disk into the cache.' + }, + 'nginxplus.http_cache_iops': { + info: '

HTTP cache IOPS.

Served - valid, expired, and revalidated responses read from the cache. Written - miss, expired, and bypassed responses written to the cache. Bypassed - miss, expired, and bypass responses.

' + }, + 'nginxplus.http_cache_io': { + info: '

HTTP cache IO.

Served - valid, expired, and revalidated responses read from the cache. Written - miss, expired, and bypassed responses written to the cache. Bypassed - miss, expired, and bypass responses.

' + }, + 'nginxplus.http_cache_size': { + info: 'The current size of the cache.' + }, + 'nginxplus.stream_server_zone_connections_rate': { + info: 'The number of accepted connections to the Stream Server Zone.' + }, + 'nginxplus.stream_server_zone_sessions_per_code_class_rate': { + info: 'The number of completed sessions for the Stream Server Zone. Sessions grouped by status code class.' + }, + 'nginxplus.stream_server_zone_traffic_rate': { + info: 'The amount of data transferred to and from the Stream Server Zone.' + }, + 'nginxplus.stream_server_zone_connections_processing_count': { + info: 'The number of client connections to the Stream Server Zone that are currently being processed.' + }, + 'nginxplus.stream_server_zone_connections_discarded_rate': { + info: 'The number of connections to the Stream Server Zone completed without creating a session.' + }, + 'nginxplus.stream_upstream_peers_count': { + info: 'The number of Stream Upstream servers.' + }, + 'nginxplus.stream_upstream_zombies_count': { + info: 'The current number of Stream Upstream servers removed from the group but still processing active client connections.' + }, + 'nginxplus.stream_upstream_server_connections_rate': { + info: 'The number of connections forwarded to the Stream Upstream Server.' + }, + 'nginxplus.stream_upstream_server_traffic_rate': { + info: 'The amount of traffic transferred to and from the Stream Upstream Server.' + }, + 'nginxplus.stream_upstream_server_state': { + info: 'The current state of the Stream Upstream Server. Status active if set to 1.' + }, + 'nginxplus.stream_upstream_server_downtime': { + info: 'The time the Stream Upstream Server has spent in the unavail, checking, and unhealthy states.' + }, + 'nginxplus.stream_upstream_server_connections_count': { + info: 'The current number of connections to the Stream Upstream Server.' + }, + 'nginxplus.resolver_zone_requests_rate': { + info: '

Resolver zone DNS requests.

Name - requests to resolve names to addresses. Srv - requests to resolve SRV records. Addr - requests to resolve addresses to names.

' + }, + 'nginxplus.resolver_zone_responses_rate': { + info: '

Resolver zone DNS responses.

NoError - successful responses. FormErr - format error responses. ServFail - server failure responses. NXDomain - host not found responses. NotImp - unimplemented responses. Refused - operation refused responses. TimedOut - timed out requests. Unknown - requests completed with an unknown error.

' + }, + // ------------------------------------------------------------------------ // HTTP check @@ -4487,19 +4856,6 @@ netdataDashboard.context = { info: 'Statistics about RetroShare\'s DHT. These values are estimated!' }, - // ------------------------------------------------------------------------ - // fping - - 'fping.quality': { - colors: NETDATA.colors[10], - height: 0.5 - }, - - 'fping.packets': { - height: 0.5 - }, - - // ------------------------------------------------------------------------ // containers diff --git a/web/gui/main.js b/web/gui/main.js index a2a186703..20f455fda 100644 --- a/web/gui/main.js +++ b/web/gui/main.js @@ -4932,8 +4932,7 @@ function handleSignInMessage(e) { netdataRegistryCallback(registryAgents); if (e.data.redirectURI && !window.location.href.includes(e.data.redirectURI)) { - // lgtm false-positive - redirectURI does not come from user input, but from iframe callback - window.location.replace(e.data.redirectURI); // lgtm[js/client-side-unvalidated-url-redirection] + window.location.replace(e.data.redirectURI); } } diff --git a/web/server/README.md b/web/server/README.md index 6485b84bc..407df6c03 100644 --- a/web/server/README.md +++ b/web/server/README.md @@ -1,7 +1,11 @@ # Web server @@ -47,7 +51,7 @@ Using the above, Netdata will bind to: - IPv4 127.0.0.1 at port 19999 (port was used from `default port`). Only the UI (dashboard) and the read API will be accessible on this port. Both HTTP and HTTPS requests will be accepted. - IPv4 10.1.1.1 at port 19998. The management API and `netdata.conf` will be accessible on this port. - All the IPs `hostname` resolves to (both IPv4 and IPv6 depending on the resolved IPs) at port 19997. Only badges will be accessible on this port. -- All IPv6 IPs at port 19996. Only metric streaming requests from other Netdata agents will be accepted on this port. Only encrypted streams will be allowed (i.e. child nodes also need to be [configured for TLS](/streaming/README.md). 
+- All IPv6 IPs at port 19996. Only metric streaming requests from other Netdata agents will be accepted on this port. Only encrypted streams will be allowed (i.e. child nodes also need to be [configured for TLS](https://github.com/netdata/netdata/blob/master/streaming/README.md)). - All the IPs `localhost` resolves to (both IPv4 and IPv6 depending the resolved IPs) at port 19996. This port will only accept registry API requests. - All IPv4 and IPv6 IPs at port `http` as set in `/etc/services`. Only the UI (dashboard) and the read API will be accessible on this port. - Unix domain socket `/run/netdata/netdata.sock`. All requests are serviceable on this socket. Note that in some OSs like Fedora, every service sees a different `/tmp`, so don't create a Unix socket under `/tmp`. `/run` or `/var/run` is suggested. @@ -136,7 +140,7 @@ Example: bind to = *=dashboard|registry|badges|management|streaming|netdata.conf^SSL=force ``` -For information how to configure the child to use TLS, check [securing the communication](/streaming/README.md#securing-streaming-communications) in the streaming documentation. There you will find additional details on the expected behavior for client and server nodes, when their respective TLS options are enabled. +For information on how to configure the child to use TLS, check [securing the communication](https://github.com/netdata/netdata/blob/master/streaming/README.md#securing-streaming-communications) in the streaming documentation. There you will find additional details on the expected behavior for client and server nodes, when their respective TLS options are enabled. When we define the use of SSL in a Netdata agent for different ports, Netdata will apply the behavior specified on each port. For example, using the configuration line below: @@ -192,7 +196,7 @@ Netdata supports access lists in `netdata.conf`: - `allow netdata.conf from` checks the IP to allow `http://netdata.host:19999/netdata.conf`. 
The IPs listed are all the private IPv4 addresses, including link local IPv6 addresses. Keep in mind that connections to Netdata API ports are filtered by `allow connections from`. So, IPs allowed by `allow netdata.conf from` should also be allowed by `allow connections from`. -- `allow management from` checks the IPs to allow API management calls. Management via the API is currently supported for [health](/web/api/health/README.md#health-management-api) +- `allow management from` checks the IPs to allow API management calls. Management via the API is currently supported for [health](https://github.com/netdata/netdata/blob/master/web/api/health/README.md#health-management-api) In order to check the FQDN of the connection without opening the Netdata agent to DNS-spoofing, a reverse-dns record must be setup for the connecting host. At connection time the reverse-dns of the peer IP address is resolved, and @@ -218,13 +222,13 @@ present that may match DNS FQDNs. |setting|default|info| |:-----:|:-----:|:---| -|ses max window|`15`|See [single exponential smoothing](/web/api/queries/des/README.md)| -|des max window|`15`|See [double exponential smoothing](/web/api/queries/des/README.md)| +|ses max window|`15`|See [single exponential smoothing](https://github.com/netdata/netdata/blob/master/web/api/queries/des/README.md)| +|des max window|`15`|See [double exponential smoothing](https://github.com/netdata/netdata/blob/master/web/api/queries/des/README.md)| |listen backlog|`4096`|The port backlog. Check `man 2 listen`.| |disconnect idle clients after seconds|`60`|The time in seconds to disconnect web clients after being totally idle.| |timeout for first request|`60`|How long to wait for a client to send a request before closing the socket. 
Prevents slow request attacks.| -|accept a streaming request every seconds|`0`|Can be used to set a limit on how often a parent node will accept streaming requests from child nodes in a [streaming and replication setup](/streaming/README.md)| -|respect do not track policy|`no`|If set to `yes`, Netdata will respect the user's browser preferences for [Do Not Track](https://www.eff.org/issues/do-not-track) (DNT) and storing cookies. If DNT is _enabled_ in the browser, and this option is set to `yes`, users will not be able to sign in to Netdata Cloud via their local Agent dashboard, and their node will not connect to any [registry](/registry/README.md). For certain browsers, users must disable DNT and change this option to `yes` for full functionality.| +|accept a streaming request every seconds|`0`|Can be used to set a limit on how often a parent node will accept streaming requests from child nodes in a [streaming and replication setup](https://github.com/netdata/netdata/blob/master/streaming/README.md)| +|respect do not track policy|`no`|If set to `yes`, Netdata will respect the user's browser preferences for [Do Not Track](https://www.eff.org/issues/do-not-track) (DNT) and storing cookies. If DNT is _enabled_ in the browser, and this option is set to `yes`, users will not be able to sign in to Netdata Cloud via their local Agent dashboard, and their node will not connect to any [registry](https://github.com/netdata/netdata/blob/master/registry/README.md). 
For certain browsers, users must disable DNT and change this option to `yes` for full functionality.| |x-frame-options response header||[Avoid clickjacking attacks, by ensuring that the content is not embedded into other sites](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options).| |enable gzip compression|`yes`|When set to `yes`, Netdata web responses will be GZIP compressed, if the web client accepts such responses.| |gzip compression strategy|`default`|Valid strategies are `default`, `filtered`, `huffman only`, `rle` and `fixed`| diff --git a/web/server/static/static-threaded.c b/web/server/static/static-threaded.c index 26e9a47bd..aca7d7ec0 100644 --- a/web/server/static/static-threaded.c +++ b/web/server/static/static-threaded.c @@ -307,7 +307,7 @@ static int web_server_rcv_callback(POLLINFO *pi, short int *events) { web_client_send(w); } - if(unlikely(w->mode == WEB_CLIENT_MODE_FILECOPY)) { + else if(unlikely(w->mode == WEB_CLIENT_MODE_FILECOPY)) { if(w->pollinfo_filecopy_slot == 0) { debug(D_WEB_CLIENT, "%llu: FILECOPY DETECTED ON FD %d", w->id, pi->fd); @@ -408,6 +408,10 @@ static void socket_listen_main_static_threaded_worker_cleanup(void *ptr) { worker_unregister(); } +static bool web_server_should_stop(void) { + return !service_running(SERVICE_WEB_SERVER); +} + void *socket_listen_main_static_threaded_worker(void *ptr) { worker_private = (struct web_server_static_threaded_worker *)ptr; worker_private->running = 1; @@ -430,6 +434,7 @@ void *socket_listen_main_static_threaded_worker(void *ptr) { , web_server_rcv_callback , web_server_snd_callback , NULL + , web_server_should_stop , web_allow_connections_from , web_allow_connections_dns , NULL @@ -452,35 +457,35 @@ static void socket_listen_main_static_threaded_cleanup(void *ptr) { struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; - int i, found = 0; - usec_t max = 2 * USEC_PER_SEC, step = 50000; - - 
// we start from 1, - 0 is self - for(i = 1; i < static_threaded_workers_count; i++) { - if(static_workers_private_data[i].running) { - found++; - info("stopping worker %d", i + 1); - netdata_thread_cancel(static_workers_private_data[i].thread); - } - else - info("found stopped worker %d", i + 1); - } - - while(found && max > 0) { - max -= step; - info("Waiting %d static web threads to finish...", found); - sleep_usec(step); - found = 0; - - // we start from 1, - 0 is self - for(i = 1; i < static_threaded_workers_count; i++) { - if (static_workers_private_data[i].running) - found++; - } - } - - if(found) - error("%d static web threads are taking too long to finish. Giving up.", found); +// int i, found = 0; +// usec_t max = 2 * USEC_PER_SEC, step = 50000; +// +// // we start from 1, - 0 is self +// for(i = 1; i < static_threaded_workers_count; i++) { +// if(static_workers_private_data[i].running) { +// found++; +// info("stopping worker %d", i + 1); +// netdata_thread_cancel(static_workers_private_data[i].thread); +// } +// else +// info("found stopped worker %d", i + 1); +// } +// +// while(found && max > 0) { +// max -= step; +// info("Waiting %d static web threads to finish...", found); +// sleep_usec(step); +// found = 0; +// +// // we start from 1, - 0 is self +// for(i = 1; i < static_threaded_workers_count; i++) { +// if (static_workers_private_data[i].running) +// found++; +// } +// } +// +// if(found) +// error("%d static web threads are taking too long to finish. Giving up.", found); info("closing all web server sockets..."); listen_sockets_close(&api_sockets); @@ -502,7 +507,7 @@ void *socket_listen_main_static_threaded(void *ptr) { // 6 threads is the optimal value // since 6 are the parallel connections browsers will do // so, if the machine has more CPUs, avoid using resources unnecessarily - int def_thread_count = (processors > 6) ? 
6 : processors; + int def_thread_count = MIN(get_netdata_cpus(), 6); if (!strcmp(config_get(CONFIG_SECTION_WEB, "mode", ""),"single-threaded")) { info("Running web server with one thread, because mode is single-threaded"); @@ -534,7 +539,7 @@ void *socket_listen_main_static_threaded(void *ptr) { static_workers_private_data[i].max_sockets = max_sockets / static_threaded_workers_count; char tag[50 + 1]; - snprintfz(tag, 50, "WEB_SERVER[static%d]", i+1); + snprintfz(tag, 50, "WEB[%d]", i+1); info("starting worker %d", i+1); netdata_thread_create(&static_workers_private_data[i].thread, tag, NETDATA_THREAD_OPTION_DEFAULT, diff --git a/web/server/web_client.c b/web/server/web_client.c index b3c5ada7a..c14b86f3e 100644 --- a/web/server/web_client.c +++ b/web/server/web_client.c @@ -129,10 +129,10 @@ void web_client_request_done(struct web_client *w) { , mode , sent , size - , -((size > 0) ? ((size - sent) / (double) size * 100.0) : 0.0) - , dt_usec(&w->tv_ready, &w->tv_in) / 1000.0 - , dt_usec(&tv, &w->tv_ready) / 1000.0 - , dt_usec(&tv, &w->tv_in) / 1000.0 + , -((size > 0) ? ((double)(size - sent) / (double) size * 100.0) : 0.0) + , (double)dt_usec(&w->tv_ready, &w->tv_in) / 1000.0 + , (double)dt_usec(&tv, &w->tv_ready) / 1000.0 + , (double)dt_usec(&tv, &w->tv_in) / 1000.0 , w->response.code , strip_control_characters(w->last_url) ); @@ -302,7 +302,7 @@ int mysendfile(struct web_client *w, char *filename) { } } - // if the filename contains a .. refuse to serve it + // if the filename contains a double dot refuse to serve it if(strstr(filename, "..") != 0) { debug(D_WEB_CLIENT_ACCESS, "%llu: File '%s' is not acceptable.", w->id, filename); w->response.data->contenttype = CT_TEXT_HTML; @@ -831,9 +831,8 @@ static inline char *web_client_valid_method(struct web_client *w, char *s) { * @param s is the first address of the string. * @param ptr is the address of the separator. 
*/ -static void web_client_set_path_query(struct web_client *w, char *s, char *ptr) { +static void web_client_set_path_query(struct web_client *w, const char *s, char *ptr) { w->url_path_length = (size_t)(ptr -s); - w->url_search_path = ptr; } @@ -1250,12 +1249,15 @@ static inline void web_client_send_http_header(struct web_client *w) { if(bytes > 0) w->stats_sent_bytes += bytes; - error("HTTP headers failed to be sent (I sent %zu bytes but the system sent %zd bytes). Closing web client." - , buffer_strlen(w->response.header_output) - , bytes); + if (bytes < 0) { - WEB_CLIENT_IS_DEAD(w); - return; + error("HTTP headers failed to be sent (I sent %zu bytes but the system sent %zd bytes). Closing web client." + , buffer_strlen(w->response.header_output) + , bytes); + + WEB_CLIENT_IS_DEAD(w); + return; + } } else w->stats_sent_bytes += bytes; @@ -1314,6 +1316,9 @@ static inline int web_client_switch_host(RRDHOST *host, struct web_client *w, ch } static inline int web_client_process_url(RRDHOST *host, struct web_client *w, char *url) { + if(unlikely(!service_running(ABILITY_WEB_REQUESTS))) + return web_client_permission_denied(w); + static uint32_t hash_api = 0, hash_netdata_conf = 0, @@ -1423,7 +1428,7 @@ static inline int web_client_process_url(RRDHOST *host, struct web_client *w, ch // replace the zero bytes with spaces buffer_char_replace(w->response.data, '\0', ' '); - // just leave the buffer as is + // just leave the buffer as-is // it will be copied back to the client return HTTP_RESP_OK; @@ -1540,7 +1545,7 @@ void web_client_process_request(struct web_client *w) { break; } - // keep track of the time we done processing + // keep track of the processing time now_realtime_timeval(&w->tv_ready); w->response.sent = 0; @@ -1612,7 +1617,6 @@ ssize_t web_client_send_chunk_header(struct web_client *w, size_t len) else if(bytes == 0) { debug(D_WEB_CLIENT, "%llu: Did not send chunk header to the client.", w->id); - WEB_CLIENT_IS_DEAD(w); } else { debug(D_WEB_CLIENT, 
"%llu: Failed to send chunk header to client.", w->id); @@ -1635,7 +1639,6 @@ ssize_t web_client_send_chunk_close(struct web_client *w) else if(bytes == 0) { debug(D_WEB_CLIENT, "%llu: Did not send chunk suffix to the client.", w->id); - WEB_CLIENT_IS_DEAD(w); } else { debug(D_WEB_CLIENT, "%llu: Failed to send chunk suffix to client.", w->id); @@ -1658,7 +1661,6 @@ ssize_t web_client_send_chunk_finalize(struct web_client *w) else if(bytes == 0) { debug(D_WEB_CLIENT, "%llu: Did not send chunk finalize suffix to the client.", w->id); - WEB_CLIENT_IS_DEAD(w); } else { debug(D_WEB_CLIENT, "%llu: Failed to send chunk finalize suffix to client.", w->id); @@ -1775,7 +1777,6 @@ ssize_t web_client_send_deflate(struct web_client *w) debug(D_WEB_CLIENT, "%llu: Did not send any bytes to the client (zhave = %zu, zsent = %zu, need to send = %zu).", w->id, w->response.zhave, w->response.zsent, w->response.zhave - w->response.zsent); - WEB_CLIENT_IS_DEAD(w); } else { debug(D_WEB_CLIENT, "%llu: Failed to send data to client.", w->id); @@ -1828,7 +1829,6 @@ ssize_t web_client_send(struct web_client *w) { } else if(likely(bytes == 0)) { debug(D_WEB_CLIENT, "%llu: Did not send any bytes to the client.", w->id); - WEB_CLIENT_IS_DEAD(w); } else { debug(D_WEB_CLIENT, "%llu: Failed to send data to client.", w->id); @@ -1846,7 +1846,7 @@ ssize_t web_client_read_file(struct web_client *w) if(unlikely(w->response.rlen <= w->response.data->len)) return 0; - ssize_t left = w->response.rlen - w->response.data->len; + ssize_t left = (ssize_t)(w->response.rlen - w->response.data->len); ssize_t bytes = read(w->ifd, &w->response.data->buffer[w->response.data->len], (size_t)left); if(likely(bytes > 0)) { size_t old = w->response.data->len; @@ -1896,7 +1896,7 @@ ssize_t web_client_receive(struct web_client *w) return web_client_read_file(w); ssize_t bytes; - ssize_t left = w->response.data->size - w->response.data->len; + ssize_t left = (ssize_t)(w->response.data->size - w->response.data->len); // do 
we have any space for more data? buffer_need_bytes(w->response.data, NETDATA_WEB_REQUEST_RECEIVE_SIZE); @@ -1928,10 +1928,32 @@ ssize_t web_client_receive(struct web_client *w) debug(D_WEB_CLIENT, "%llu: Received %zd bytes.", w->id, bytes); debug(D_WEB_DATA, "%llu: Received data: '%s'.", w->id, &w->response.data->buffer[old]); } - else { + else if (bytes < 0) { debug(D_WEB_CLIENT, "%llu: receive data failed.", w->id); WEB_CLIENT_IS_DEAD(w); - } + } else + debug(D_WEB_CLIENT, "%llu: Received %zd bytes.", w->id, bytes); return(bytes); } + + +int web_client_socket_is_now_used_for_streaming(struct web_client *w) { + // prevent the web_client from closing the streaming socket + + WEB_CLIENT_IS_DEAD(w); + + if(web_server_mode == WEB_SERVER_MODE_STATIC_THREADED) { + web_client_flag_set(w, WEB_CLIENT_FLAG_DONT_CLOSE_SOCKET); + } + else { + if(w->ifd == w->ofd) + w->ifd = w->ofd = -1; + else + w->ifd = -1; + } + + buffer_flush(w->response.data); + + return HTTP_RESP_OK; +} diff --git a/web/server/web_client.h b/web/server/web_client.h index 630d71a8a..d0360f4f9 100644 --- a/web/server/web_client.h +++ b/web/server/web_client.h @@ -19,13 +19,16 @@ extern int web_enable_gzip, web_gzip_level, web_gzip_strategy; // HTTP_CODES 4XX Client Errors #define HTTP_RESP_BAD_REQUEST 400 +#define HTTP_RESP_UNAUTHORIZED 401 #define HTTP_RESP_FORBIDDEN 403 #define HTTP_RESP_NOT_FOUND 404 +#define HTTP_RESP_CONFLICT 409 #define HTTP_RESP_PRECOND_FAIL 412 // HTTP_CODES 5XX Server Errors #define HTTP_RESP_INTERNAL_SERVER_ERROR 500 -#define HTTP_RESP_BACKEND_FETCH_FAILED 503 +#define HTTP_RESP_BACKEND_FETCH_FAILED 503 // 503 is right +#define HTTP_RESP_SERVICE_UNAVAILABLE 503 // 503 is right #define HTTP_RESP_GATEWAY_TIMEOUT 504 #define HTTP_RESP_BACKEND_RESPONSE_INVALID 591 @@ -206,6 +209,8 @@ int mysendfile(struct web_client *w, char *filename); void web_client_build_http_header(struct web_client *w); char *strip_control_characters(char *url); +int 
web_client_socket_is_now_used_for_streaming(struct web_client *w); + #include "daemon/common.h" #endif diff --git a/web/server/web_client_cache.c b/web/server/web_client_cache.c index 1fa593580..4344209c8 100644 --- a/web/server/web_client_cache.c +++ b/web/server/web_client_cache.c @@ -11,7 +11,16 @@ static void web_client_reuse_ssl(struct web_client *w) { if (netdata_ssl_srv_ctx) { if (w->ssl.conn) { - SSL_clear(w->ssl.conn); + SSL_SESSION *session = SSL_get_session(w->ssl.conn); + SSL *old = w->ssl.conn; + w->ssl.conn = SSL_new(netdata_ssl_srv_ctx); + if (session) { +#if OPENSSL_VERSION_NUMBER >= OPENSSL_VERSION_111 + if (SSL_SESSION_is_resumable(session)) +#endif + SSL_set_session(w->ssl.conn, session); + } + SSL_free(old); } } } @@ -56,13 +65,15 @@ static void web_client_free(struct web_client *w) { } #endif freez(w); + __atomic_sub_fetch(&netdata_buffers_statistics.buffers_web, sizeof(struct web_client), __ATOMIC_RELAXED); } static struct web_client *web_client_alloc(void) { struct web_client *w = callocz(1, sizeof(struct web_client)); - w->response.data = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); - w->response.header = buffer_create(NETDATA_WEB_RESPONSE_HEADER_SIZE); - w->response.header_output = buffer_create(NETDATA_WEB_RESPONSE_HEADER_SIZE); + __atomic_add_fetch(&netdata_buffers_statistics.buffers_web, sizeof(struct web_client), __ATOMIC_RELAXED); + w->response.data = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_web); + w->response.header = buffer_create(NETDATA_WEB_RESPONSE_HEADER_SIZE, &netdata_buffers_statistics.buffers_web); + w->response.header_output = buffer_create(NETDATA_WEB_RESPONSE_HEADER_SIZE, &netdata_buffers_statistics.buffers_web); return w; } diff --git a/web/server/web_server.c b/web/server/web_server.c index 4da08d431..d5645a947 100644 --- a/web/server/web_server.c +++ b/web/server/web_server.c @@ -37,7 +37,7 @@ LISTEN_SOCKETS api_sockets = { }; void debug_sockets() { - BUFFER *wb = 
buffer_create(256 * sizeof(char)); + BUFFER *wb = buffer_create(256 * sizeof(char), NULL); int i; for(i = 0 ; i < (int)api_sockets.opened ; i++) { -- cgit v1.2.3