diff options
Diffstat (limited to '')
-rw-r--r-- | streaming/receiver.c | 88 | ||||
-rw-r--r-- | streaming/rrdpush.c | 88 | ||||
-rw-r--r-- | streaming/rrdpush.h | 21 | ||||
-rw-r--r-- | streaming/sender.c | 146 | ||||
-rw-r--r-- | streaming/stream.conf | 4 |
5 files changed, 298 insertions, 49 deletions
diff --git a/streaming/receiver.c b/streaming/receiver.c index b083766dd..d20658e65 100644 --- a/streaming/receiver.c +++ b/streaming/receiver.c @@ -30,6 +30,8 @@ void destroy_receiver_state(struct receiver_state *rpt) { } static void rrdpush_receiver_thread_cleanup(void *ptr) { + worker_unregister(); + static __thread int executed = 0; if(!executed) { executed = 1; @@ -338,26 +340,31 @@ static char *receiver_next_line(struct receiver_state *r, int *pos) { return NULL; } +static void streaming_parser_thread_cleanup(void *ptr) { + PARSER *parser = (PARSER *)ptr; + parser_destroy(parser); +} + size_t streaming_parser(struct receiver_state *rpt, struct plugind *cd, FILE *fp) { size_t result; - PARSER_USER_OBJECT *user = callocz(1, sizeof(*user)); - user->enabled = cd->enabled; - user->host = rpt->host; - user->opaque = rpt; - user->cd = cd; - user->trust_durations = 0; - - PARSER *parser = parser_init(rpt->host, user, fp, PARSER_INPUT_SPLIT); + + PARSER_USER_OBJECT user = { + .enabled = cd->enabled, + .host = rpt->host, + .opaque = rpt, + .cd = cd, + .trust_durations = 1 + }; + + PARSER *parser = parser_init(rpt->host, &user, fp, PARSER_INPUT_SPLIT); + + // this keeps the parser with its current value + // so, parser needs to be allocated before pushing it + netdata_thread_cleanup_push(streaming_parser_thread_cleanup, parser); + parser_add_keyword(parser, "TIMESTAMP", streaming_timestamp); parser_add_keyword(parser, "CLAIMED_ID", streaming_claimed_id); - if (unlikely(!parser)) { - error("Failed to initialize parser"); - cd->serial_failures++; - freez(user); - return 0; - } - parser->plugins_action->begin_action = &pluginsd_begin_action; parser->plugins_action->flush_action = &pluginsd_flush_action; parser->plugins_action->end_action = &pluginsd_end_action; @@ -371,12 +378,13 @@ size_t streaming_parser(struct receiver_state *rpt, struct plugind *cd, FILE *fp parser->plugins_action->clabel_commit_action = &pluginsd_clabel_commit_action; parser->plugins_action->clabel_action = &pluginsd_clabel_action; - user->parser = parser; + user.parser = parser; #ifdef ENABLE_COMPRESSION if (rpt->decompressor) rpt->decompressor->reset(rpt->decompressor); #endif + do{ if (receiver_read(rpt, fp)) break; @@ -389,10 +397,13 @@ size_t streaming_parser(struct receiver_state *rpt, struct plugind *cd, FILE *fp rpt->last_msg_t = now_realtime_sec(); } while(!netdata_exit); + done: - result= user->count; - freez(user); - parser_destroy(parser); + result = user.count; + + // free parser with the pop function + netdata_thread_cleanup_pop(1); + return result; } @@ -455,9 +466,23 @@ static int rrdpush_receive(struct receiver_state *rpt) if (strcmp(rpt->machine_guid, localhost->machine_guid) == 0) { log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "DENIED - ATTEMPT TO RECEIVE METRICS FROM MACHINE_GUID IDENTICAL TO PARENT"); - error("STREAM %s [receive from %s:%s]: denied to receive metrics, machine GUID [%s] is my own. Did you copy the parent/proxy machine GUID to a child?", rpt->hostname, rpt->client_ip, rpt->client_port, rpt->machine_guid); + error("STREAM %s [receive from %s:%s]: denied to receive metrics, machine GUID [%s] is my own. Did you copy the parent/proxy machine GUID to a child, or is this an inter-agent loop?", rpt->hostname, rpt->client_ip, rpt->client_port, rpt->machine_guid); + char initial_response[HTTP_HEADER_SIZE + 1]; + snprintfz(initial_response, HTTP_HEADER_SIZE, "%s", START_STREAMING_ERROR_SAME_LOCALHOST); +#ifdef ENABLE_HTTPS + rpt->host->stream_ssl.conn = rpt->ssl.conn; + rpt->host->stream_ssl.flags = rpt->ssl.flags; + if(send_timeout(&rpt->ssl, rpt->fd, initial_response, strlen(initial_response), 0, 60) != (ssize_t)strlen(initial_response)) { +#else + if(send_timeout(rpt->fd, initial_response, strlen(initial_response), 0, 60) != strlen(initial_response)) { +#endif + log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rpt->host->hostname, "FAILED - CANNOT REPLY"); + error("STREAM %s [receive from [%s]:%s]: cannot send command.", rpt->host->hostname, rpt->client_ip, rpt->client_port); + close(rpt->fd); + return 0; + } close(rpt->fd); - return 1; + return 0; } if (rpt->host==NULL) { @@ -609,6 +634,12 @@ static int rrdpush_receive(struct receiver_state *rpt) if(sock_delnonblock(rpt->fd) < 0) error("STREAM %s [receive from [%s]:%s]: cannot remove the non-blocking flag from socket %d", rpt->host->hostname, rpt->client_ip, rpt->client_port, rpt->fd); + struct timeval timeout; + timeout.tv_sec = 120; + timeout.tv_usec = 0; + if (unlikely(setsockopt(rpt->fd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof timeout) != 0)) + error("STREAM %s [receive from [%s]:%s]: cannot set timeout for socket %d", rpt->host->hostname, rpt->client_ip, rpt->client_port, rpt->fd); + // convert the socket to a FILE * FILE *fp = fdopen(rpt->fd, "r"); if(!fp) { @@ -640,6 +671,9 @@ static int rrdpush_receive(struct receiver_state *rpt) rpt->host->hostname); } } + rpt->host->senders_connect_time = now_realtime_sec(); + rpt->host->senders_last_chart_command = 0; + rpt->host->trigger_chart_obsoletion_check = 1; rrdhost_unlock(rpt->host); // call the plugins.d processor to receive the metrics @@ -648,9 +682,9 @@ static int rrdpush_receive(struct receiver_state *rpt) cd.version = rpt->stream_version; -#if defined(ENABLE_ACLK) +#if defined(ENABLE_NEW_CLOUD_PROTOCOL) // in case we have cloud connection we inform cloud - // new slave connected + // new child connected if (netdata_cloud_setting) aclk_host_state_update(rpt->host, 1); #endif @@ -662,9 +696,9 @@ static int rrdpush_receive(struct receiver_state *rpt) error("STREAM %s [receive from [%s]:%s]: disconnected (completed %zu updates).", rpt->hostname, rpt->client_ip, rpt->client_port, count); -#if defined(ENABLE_ACLK) +#if defined(ENABLE_NEW_CLOUD_PROTOCOL) // in case we have cloud connection we inform cloud - // new slave connected + // new child connected if (netdata_cloud_setting) aclk_host_state_update(rpt->host, 0); #endif @@ -675,6 +709,8 @@ static int rrdpush_receive(struct receiver_state *rpt) rrdhost_wrlock(rpt->host); netdata_mutex_lock(&rpt->host->receiver_lock); if (rpt->host->receiver == rpt) { + rpt->host->senders_connect_time = 0; + rpt->host->trigger_chart_obsoletion_check = 0; rpt->host->senders_disconnected_time = now_realtime_sec(); rrdhost_flag_set(rpt->host, RRDHOST_FLAG_ORPHAN); if(health_enabled == CONFIG_BOOLEAN_AUTO) @@ -699,7 +735,9 @@ void *rrdpush_receiver_thread(void *ptr) { struct receiver_state *rpt = (struct receiver_state *)ptr; info("STREAM %s [%s]:%s: receive thread created (task id %d)", rpt->hostname, rpt->client_ip, rpt->client_port, gettid()); + worker_register("STREAMRCV"); rrdpush_receive(rpt); + worker_unregister(); netdata_thread_cleanup_pop(1); return NULL; diff --git a/streaming/rrdpush.c b/streaming/rrdpush.c index 8829d1eea..77774d8d3 100644 --- a/streaming/rrdpush.c +++ b/streaming/rrdpush.c @@ -136,11 +136,7 @@ static inline int should_send_chart_matching(RRDSET *st) { if (rrdset_flag_check(st, RRDSET_FLAG_ANOMALY_DETECTION)) return ml_streaming_enabled(); - if(unlikely(!rrdset_flag_check(st, RRDSET_FLAG_ENABLED))) { - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_SEND); - rrdset_flag_set(st, RRDSET_FLAG_UPSTREAM_IGNORE); - } - else if(!rrdset_flag_check(st, RRDSET_FLAG_UPSTREAM_SEND|RRDSET_FLAG_UPSTREAM_IGNORE)) { + if(!rrdset_flag_check(st, RRDSET_FLAG_UPSTREAM_SEND|RRDSET_FLAG_UPSTREAM_IGNORE)) { RRDHOST *host = st->rrdhost; if(simple_pattern_matches(host->rrdpush_send_charts_matching, st->id) || @@ -421,6 +417,86 @@ void rrdpush_claimed_id(RRDHOST *host) error("STREAM %s [send]: cannot write to internal pipe", host->hostname); } +int connect_to_one_of_destinations( + struct rrdpush_destinations *destinations, + int default_port, + struct timeval *timeout, + size_t *reconnects_counter, + char *connected_to, + size_t connected_to_size, + struct rrdpush_destinations **destination) +{ + int sock = -1; + + for (struct rrdpush_destinations *d = destinations; d; d = d->next) { + if (d->disabled_no_proper_reply) { + d->disabled_no_proper_reply = 0; + continue; + } else if (d->disabled_because_of_localhost) { + continue; + } else if (d->disabled_already_streaming && (d->disabled_already_streaming + 30 > now_realtime_sec())) { + continue; + } else if (d->disabled_because_of_denied_access) { + d->disabled_because_of_denied_access = 0; + continue; + } + + if (reconnects_counter) + *reconnects_counter += 1; + sock = connect_to_this(d->destination, default_port, timeout); + if (sock != -1) { + if (connected_to && connected_to_size) { + strncpy(connected_to, d->destination, connected_to_size); + connected_to[connected_to_size - 1] = '\0'; + } + *destination = d; + break; + } + } + + return sock; +} + +struct rrdpush_destinations *destinations_init(const char *dests) { + const char *s = dests; + struct rrdpush_destinations *destinations = NULL, *prev = NULL; + while(*s) { + const char *e = s; + + // skip path, moving both s(tart) and e(nd) + if(*e == '/') + while(!isspace(*e) && *e != ',') s = ++e; + + // skip separators, moving both s(tart) and e(nd) + while(isspace(*e) || *e == ',') s = ++e; + + // move e(nd) to the first separator + while(*e && !isspace(*e) && *e != ',' && *e != '/') e++; + + // is there anything? + if(!*s || s == e) break; + + char buf[e - s + 1]; + strncpyz(buf, s, e - s); + struct rrdpush_destinations *d = callocz(1, sizeof(struct rrdpush_destinations)); + strncpyz(d->destination, buf, sizeof(d->destination)-1); + d->disabled_no_proper_reply = 0; + d->disabled_because_of_localhost = 0; + d->disabled_already_streaming = 0; + d->disabled_because_of_denied_access = 0; + d->next = NULL; + if (!destinations) { + destinations = d; + } else { + prev->next = d; + } + prev = d; + + s = e; + } + return destinations; +} + // ---------------------------------------------------------------------------- // rrdpush sender thread @@ -539,6 +615,8 @@ int rrdpush_receiver_thread_spawn(struct web_client *w, char *url) { system_info->ml_capable = strtoul(value, NULL, 0); else if(!strcmp(name, "ml_enabled")) system_info->ml_enabled = strtoul(value, NULL, 0); + else if(!strcmp(name, "mc_version")) + system_info->mc_version = strtoul(value, NULL, 0); else if(!strcmp(name, "tags")) tags = value; else if(!strcmp(name, "ver")) diff --git a/streaming/rrdpush.h b/streaming/rrdpush.h index 7eb2c6e58..6efe8cd6f 100644 --- a/streaming/rrdpush.h +++ b/streaming/rrdpush.h @@ -26,6 +26,10 @@ #define START_STREAMING_PROMPT_V2 "Hit me baby, push them over and bring the host labels..." #define START_STREAMING_PROMPT_VN "Hit me baby, push them over with the version=" +#define START_STREAMING_ERROR_SAME_LOCALHOST "Don't hit me baby, you are trying to stream my localhost back" +#define START_STREAMING_ERROR_ALREADY_STREAMING "This GUID is already streaming to this server" +#define START_STREAMING_ERROR_NOT_PERMITTED "You are not permitted to access this. Check the logs for more info." + #define HTTP_HEADER_SIZE 8192 typedef enum { @@ -138,6 +142,14 @@ struct receiver_state { #endif }; +struct rrdpush_destinations { + char destination[CONNECTED_TO_SIZE + 1]; + int disabled_no_proper_reply; + int disabled_because_of_localhost; + time_t disabled_already_streaming; + int disabled_because_of_denied_access; + struct rrdpush_destinations *next; +}; extern unsigned int default_rrdpush_enabled; #ifdef ENABLE_COMPRESSION @@ -149,6 +161,7 @@ extern char *default_rrdpush_send_charts_matching; extern unsigned int remote_clock_resync_iterations; extern void sender_init(struct sender_state *s, RRDHOST *parent); +extern struct rrdpush_destinations *destinations_init(const char *destinations); void sender_start(struct sender_state *s); void sender_commit(struct sender_state *s); extern int rrdpush_init(); @@ -164,6 +177,14 @@ extern void rrdpush_sender_thread_stop(RRDHOST *host); extern void rrdpush_sender_send_this_host_variable_now(RRDHOST *host, RRDVAR *rv); extern void log_stream_connection(const char *client_ip, const char *client_port, const char *api_key, const char *machine_guid, const char *host, const char *msg); +extern int connect_to_one_of_destinations( + struct rrdpush_destinations *destinations, + int default_port, + struct timeval *timeout, + size_t *reconnects_counter, + char *connected_to, + size_t connected_to_size, + struct rrdpush_destinations **destination); #ifdef ENABLE_COMPRESSION struct compressor_state *create_compressor(); diff --git a/streaming/sender.c b/streaming/sender.c index 72259c3ab..a95cc8673 100644 --- a/streaming/sender.c +++ b/streaming/sender.c @@ -2,6 +2,26 @@ #include "rrdpush.h" +#define WORKER_SENDER_JOB_CONNECT 0 +#define WORKER_SENDER_JOB_PIPE_READ 1 +#define WORKER_SENDER_JOB_SOCKET_RECEIVE 2 +#define WORKER_SENDER_JOB_EXECUTE 3 +#define WORKER_SENDER_JOB_SOCKET_SEND 4 +#define WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE 5 +#define WORKER_SENDER_JOB_DISCONNECT_OVERFLOW 6 +#define WORKER_SENDER_JOB_DISCONNECT_TIMEOUT 7 +#define WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR 8 +#define WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR 9 +#define WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR 10 +#define WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED 11 +#define WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR 12 +#define WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR 13 +#define WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION 14 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 15 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 15 +#endif + extern struct config stream_config; extern int netdata_use_ssl_on_stream; extern char *netdata_ssl_ca_path; @@ -21,8 +41,8 @@ static inline void rrdpush_sender_thread_close_socket(RRDHOST *host); * Inform the user through the error log file and * deactivate compression by downgrading the stream protocol. */ -static inline void deactivate_compression(struct sender_state *s) -{ +static inline void deactivate_compression(struct sender_state *s) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION); error("STREAM_COMPRESSION: Deactivating compression to avoid stream corruption"); default_compression_enabled = 0; s->rrdpush_compression = 0; @@ -183,6 +203,18 @@ void rrdpush_clean_encoded(stream_encoded_t *se) freez(se->kernel_version); } +static inline long int parse_stream_version_for_errors(char *http) +{ + if (!memcmp(http, START_STREAMING_ERROR_SAME_LOCALHOST, sizeof(START_STREAMING_ERROR_SAME_LOCALHOST))) + return -2; + else if (!memcmp(http, START_STREAMING_ERROR_ALREADY_STREAMING, sizeof(START_STREAMING_ERROR_ALREADY_STREAMING))) + return -3; + else if (!memcmp(http, START_STREAMING_ERROR_NOT_PERMITTED, sizeof(START_STREAMING_ERROR_NOT_PERMITTED))) + return -4; + else + return -1; +} + static inline long int parse_stream_version(RRDHOST *host, char *http) { long int stream_version = -1; @@ -207,6 +239,9 @@ static inline long int parse_stream_version(RRDHOST *host, char *http) host->labels.labels_flag |= LABEL_FLAG_STOP_STREAM; host->labels.labels_flag &= ~LABEL_FLAG_UPDATE_STREAM; } + else { + stream_version = parse_stream_version_for_errors(http); + } } } return stream_version; @@ -226,13 +261,14 @@ static int rrdpush_sender_thread_connect_to_parent(RRDHOST *host, int default_po debug(D_STREAM, "STREAM: Attempting to connect..."); info("STREAM %s [send to %s]: connecting...", host->hostname, host->rrdpush_send_destination); - host->rrdpush_sender_socket = connect_to_one_of( - host->rrdpush_send_destination + host->rrdpush_sender_socket = connect_to_one_of_destinations( + host->destinations , default_port , &tv , &s->reconnects_counter , s->connected_to , sizeof(s->connected_to)-1 + , &host->destination ); if(unlikely(host->rrdpush_sender_socket == -1)) { @@ -299,6 +335,7 @@ if(!s->rrdpush_compression) "&hops=%d" "&ml_capable=%d" "&ml_enabled=%d" + "&mc_version=%d" "&tags=%s" "&ver=%d" "&NETDATA_INSTANCE_CLOUD_TYPE=%s" @@ -344,6 +381,7 @@ if(!s->rrdpush_compression) , host->system_info->hops + 1 , host->system_info->ml_capable , host->system_info->ml_enabled + , host->system_info->mc_version , (host->tags) ? host->tags : "" , s->version , (host->system_info->cloud_provider_type) ? host->system_info->cloud_provider_type : "" @@ -389,7 +427,10 @@ if(!s->rrdpush_compression) err = SSL_get_error(host->ssl.conn, err); error("SSL cannot connect with the server: %s ",ERR_error_string((long)SSL_get_error(host->ssl.conn,err),NULL)); if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); rrdpush_sender_thread_close_socket(host); + if (host->destination->next) + host->destination->disabled_no_proper_reply = 1; return 0; }else { host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE; @@ -399,8 +440,11 @@ if(!s->rrdpush_compression) if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) { if (netdata_validate_server == NETDATA_SSL_VALID_CERTIFICATE) { if ( security_test_certificate(host->ssl.conn)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); error("Closing the stream connection, because the server SSL certificate is not valid."); rrdpush_sender_thread_close_socket(host); + if (host->destination->next) + host->destination->disabled_no_proper_reply = 1; return 0; } } @@ -411,6 +455,7 @@ if(!s->rrdpush_compression) #else if(send_timeout(host->rrdpush_sender_socket, http, strlen(http), 0, timeout) == -1) { #endif + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); error("STREAM %s [send to %s]: failed to send HTTP header to remote netdata.", host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(host); return 0; @@ -426,6 +471,7 @@ if(!s->rrdpush_compression) received = recv_timeout(host->rrdpush_sender_socket, http, HTTP_HEADER_SIZE, 0, timeout); if(received == -1) { #endif + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); error("STREAM %s [send to %s]: remote netdata does not respond.", host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(host); return 0; @@ -435,8 +481,31 @@ if(!s->rrdpush_compression) debug(D_STREAM, "Response to sender from far end: %s", http); int32_t version = (int32_t)parse_stream_version(host, http); if(version == -1) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE); error("STREAM %s [send to %s]: server is not replying properly (is it a netdata?).", host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(host); + //catch other reject reasons and force to check other destinations + if (host->destination->next) + host->destination->disabled_no_proper_reply = 1; + return 0; + } + else if(version == -2) { + error("STREAM %s [send to %s]: remote server is the localhost for [%s].", host->hostname, s->connected_to, host->hostname); + rrdpush_sender_thread_close_socket(host); + host->destination->disabled_because_of_localhost = 1; + return 0; + } + else if(version == -3) { + error("STREAM %s [send to %s]: remote server already receives metrics for [%s].", host->hostname, s->connected_to, host->hostname); + rrdpush_sender_thread_close_socket(host); + host->destination->disabled_already_streaming = now_realtime_sec(); + return 0; + } + else if(version == -4) { + error("STREAM %s [send to %s]: remote server denied access for [%s].", host->hostname, s->connected_to, host->hostname); + rrdpush_sender_thread_close_socket(host); + if (host->destination->next) + host->destination->disabled_because_of_denied_access = 1; return 0; } s->version = version; @@ -541,9 +610,9 @@ void attempt_to_send(struct sender_state *s) { s->last_sent_t = now_monotonic_sec(); } else if (ret == -1 && (errno == EAGAIN || errno == EINTR || errno == EWOULDBLOCK)) - debug(D_STREAM, "STREAM %s [send to %s]: unavailable after polling POLLOUT", s->host->hostname, - s->connected_to); + debug(D_STREAM, "STREAM %s [send to %s]: unavailable after polling POLLOUT", s->host->hostname, s->connected_to); else if (ret == -1) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR); debug(D_STREAM, "STREAM: Send failed - closing socket..."); error("STREAM %s [send to %s]: failed to send metrics - closing connection - we have sent %zu bytes on this connection.", s->host->hostname, s->connected_to, s->sent_bytes_on_this_connection); rrdpush_sender_thread_close_socket(s->host); @@ -570,6 +639,8 @@ int ret; int sslerrno = SSL_get_error(s->host->ssl.conn, desired); if (sslerrno == SSL_ERROR_WANT_READ || sslerrno == SSL_ERROR_WANT_WRITE) return; + + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); u_long err; char buf[256]; while ((err = ERR_get_error()) != 0) { @@ -581,20 +652,25 @@ int ret; return; } #endif - ret = recv(s->host->rrdpush_sender_socket, s->read_buffer + s->read_len, sizeof(s->read_buffer) - s->read_len - 1, - MSG_DONTWAIT); + ret = recv(s->host->rrdpush_sender_socket, s->read_buffer + s->read_len, sizeof(s->read_buffer) - s->read_len - 1,MSG_DONTWAIT); if (ret>0) { s->read_len += ret; return; } + debug(D_STREAM, "Socket was POLLIN, but req %zu bytes gave %d", sizeof(s->read_buffer) - s->read_len - 1, ret); + if (ret<0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) return; - if (ret==0) + + if (ret==0) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED); error("STREAM %s [send to %s]: connection closed by far end. Restarting connection", s->host->hostname, s->connected_to); - else - error("STREAM %s [send to %s]: error during read (%d). Restarting connection", s->host->hostname, s->connected_to, - ret); + } + else { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR); + error("STREAM %s [send to %s]: error during receive (%d). Restarting connection", s->host->hostname, s->connected_to, ret); + } rrdpush_sender_thread_close_socket(s->host); } @@ -615,6 +691,8 @@ void execute_commands(struct sender_state *s) { static void rrdpush_sender_thread_cleanup_callback(void *ptr) { + worker_unregister(); + RRDHOST *host = (RRDHOST *)ptr; netdata_mutex_lock(&host->sender->mutex); @@ -683,7 +761,7 @@ void *rrdpush_sender_thread(void *ptr) { s->timeout = (int)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "timeout seconds", 60); s->default_port = (int)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "default port", 19999); s->buffer->max_size = - (size_t)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "buffer size bytes", 1024 * 1024); + (size_t)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "buffer size bytes", 1024 * 1024 * 10); s->reconnect_delay = (unsigned int)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "reconnect delay seconds", 5); remote_clock_resync_iterations = (unsigned int)appconfig_get_number( @@ -707,6 +785,25 @@ void *rrdpush_sender_thread(void *ptr) { fds[Collector].fd = s->host->rrdpush_sender_pipe[PIPE_READ]; fds[Collector].events = POLLIN; + worker_register("STREAMSND"); + worker_register_job_name(WORKER_SENDER_JOB_CONNECT, "connect"); + worker_register_job_name(WORKER_SENDER_JOB_PIPE_READ, "pipe read"); + worker_register_job_name(WORKER_SENDER_JOB_SOCKET_RECEIVE, "receive"); + worker_register_job_name(WORKER_SENDER_JOB_EXECUTE, "execute"); + worker_register_job_name(WORKER_SENDER_JOB_SOCKET_SEND, "send"); + + // disconnection reasons + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT, "disconnect timeout"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR, "disconnect poll error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR, "disconnect socket error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW, "disconnect overflow"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR, "disconnect ssl error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED, "disconnect parent closed"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR, "disconnect receive error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR, "disconnect send error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION, "disconnect no compression"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, "disconnect bad handshake"); + netdata_thread_cleanup_push(rrdpush_sender_thread_cleanup_callback, s->host); for(; s->host->rrdpush_send_enabled && !netdata_exit ;) { // check for outstanding cancellation requests @@ -714,6 +811,7 @@ void *rrdpush_sender_thread(void *ptr) { // The connection attempt blocks (after which we use the socket in nonblocking) if(unlikely(s->host->rrdpush_sender_socket == -1)) { + worker_is_busy(WORKER_SENDER_JOB_CONNECT); s->overflow = 0; s->read_len = 0; s->buffer->read = 0; @@ -731,11 +829,14 @@ void *rrdpush_sender_thread(void *ptr) { // If the TCP window never opened then something is wrong, restart connection if(unlikely(now_monotonic_sec() - s->last_sent_t > s->timeout)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); error("STREAM %s [send to %s]: could not send metrics for %d seconds - closing connection - we have sent %zu bytes on this connection via %zu send attempts.", s->host->hostname, s->connected_to, s->timeout, s->sent_bytes_on_this_connection, s->send_attempts); rrdpush_sender_thread_close_socket(s->host); continue; } + worker_is_idle(); + // Wait until buffer opens in the socket or a rrdset_done_push wakes us fds[Collector].revents = 0; fds[Socket].revents = 0; @@ -757,16 +858,18 @@ void *rrdpush_sender_thread(void *ptr) { int retval = poll(fds, 2, 1000); debug(D_STREAM, "STREAM: poll() finished collector=%d socket=%d (current chunk %zu bytes)...", fds[Collector].revents, fds[Socket].revents, outstanding); + if(unlikely(netdata_exit)) break; // Spurious wake-ups without error - loop again - if (retval == 0 || ((retval == -1) && (errno == EAGAIN || errno == EINTR))) - { + if (retval == 0 || ((retval == -1) && (errno == EAGAIN || errno == EINTR))) { debug(D_STREAM, "Spurious wakeup"); continue; } + // Only errors from poll() are internal, but try restarting the connection if(unlikely(retval == -1)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR); error("STREAM %s [send to %s]: failed to poll(). Closing socket.", s->host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(s->host); continue; @@ -774,6 +877,7 @@ void *rrdpush_sender_thread(void *ptr) { // If the collector woke us up then empty the pipe to remove the signal if (fds[Collector].revents & POLLIN || fds[Collector].revents & POLLPRI) { + worker_is_busy(WORKER_SENDER_JOB_PIPE_READ); debug(D_STREAM, "STREAM: Data added to send buffer (current buffer chunk %zu bytes)...", outstanding); char buffer[1000 + 1]; @@ -782,13 +886,19 @@ void *rrdpush_sender_thread(void *ptr) { } // Read as much as possible to fill the buffer, split into full lines for execution. - if (fds[Socket].revents & POLLIN) + if (fds[Socket].revents & POLLIN) { + worker_is_busy(WORKER_SENDER_JOB_SOCKET_RECEIVE); attempt_read(s); + } + + worker_is_busy(WORKER_SENDER_JOB_EXECUTE); execute_commands(s); // If we have data and have seen the TCP window open then try to close it by a transmission. - if (outstanding && fds[Socket].revents & POLLOUT) + if (outstanding && fds[Socket].revents & POLLOUT) { + worker_is_busy(WORKER_SENDER_JOB_SOCKET_SEND); attempt_to_send(s); + } // TODO-GAPS - why do we only check this on the socket, not the pipe? if (outstanding) { @@ -800,6 +910,7 @@ void *rrdpush_sender_thread(void *ptr) { else if (unlikely(fds[Socket].revents & POLLNVAL)) error = "connection is invalid (POLLNVAL)"; if(unlikely(error)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR); error("STREAM %s [send to %s]: restart stream because %s - %zu bytes transmitted.", s->host->hostname, s->connected_to, error, s->sent_bytes_on_this_connection); rrdpush_sender_thread_close_socket(s->host); @@ -808,6 +919,7 @@ void *rrdpush_sender_thread(void *ptr) { // protection from overflow if (s->overflow) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW); errno = 0; error("STREAM %s [send to %s]: buffer full (%zu-bytes) after %zu bytes. Restarting connection", s->host->hostname, s->connected_to, s->buffer->size, s->sent_bytes_on_this_connection); diff --git a/streaming/stream.conf b/streaming/stream.conf index e65e76fa4..33172bbcb 100644 --- a/streaming/stream.conf +++ b/streaming/stream.conf @@ -82,9 +82,9 @@ send charts matching = * # The buffer to use for sending metrics. - # 1MB is good for 10-20 seconds of data, so increase this if you expect latencies. + # 10MB is good for 60 seconds of data, so increase this if you expect latencies. # The buffer is flushed on reconnects (this will not prevent gaps at the charts). - buffer size bytes = 1048576 + buffer size bytes = 10485760 # If the connection fails, or it disconnects, # retry after that many seconds. |