diff options
Diffstat (limited to '')
-rw-r--r-- | streaming/receiver.c | 627 |
1 files changed, 351 insertions, 276 deletions
diff --git a/streaming/receiver.c b/streaming/receiver.c index 61ee33bc4..95652942e 100644 --- a/streaming/receiver.c +++ b/streaming/receiver.c @@ -16,7 +16,8 @@ extern struct config stream_config; -void destroy_receiver_state(struct receiver_state *rpt) { +void receiver_state_free(struct receiver_state *rpt) { + freez(rpt->key); freez(rpt->hostname); freez(rpt->registry_hostname); @@ -29,43 +30,23 @@ void destroy_receiver_state(struct receiver_state *rpt) { freez(rpt->client_port); freez(rpt->program_name); freez(rpt->program_version); + #ifdef ENABLE_HTTPS - if(rpt->ssl.conn){ + if(rpt->ssl.conn) SSL_free(rpt->ssl.conn); - } #endif + #ifdef ENABLE_COMPRESSION if (rpt->decompressor) rpt->decompressor->destroy(&rpt->decompressor); #endif - freez(rpt); -} - -static void rrdpush_receiver_thread_cleanup(void *ptr) { - worker_unregister(); - static __thread int executed = 0; - if(!executed) { - executed = 1; - struct receiver_state *rpt = (struct receiver_state *) ptr; - // If the shutdown sequence has started, and this receiver is still attached to the host then we cannot touch - // the host pointer as it is unpredictable when the RRDHOST is deleted. Do the cleanup from rrdhost_free(). - if (netdata_exit && rpt->host) { - rpt->exited = 1; - return; - } + if(rpt->system_info) + rrdhost_system_info_free(rpt->system_info); - // Make sure that we detach this thread and don't kill a freshly arriving receiver - if (!netdata_exit && rpt->host) { - netdata_mutex_lock(&rpt->host->receiver_lock); - if (rpt->host->receiver == rpt) - rpt->host->receiver = NULL; - netdata_mutex_unlock(&rpt->host->receiver_lock); - } + __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_receivers, sizeof(*rpt), __ATOMIC_RELAXED); - info("STREAM %s [receive from [%s]:%s]: receive thread ended (task id %d)", rpt->hostname, rpt->client_ip, rpt->client_port, gettid()); - destroy_receiver_state(rpt); - } + freez(rpt); } #include "collectors/plugins.d/pluginsd_parser.h" @@ -105,11 +86,10 @@ PARSER_RC streaming_claimed_id(char **words, size_t num_words, void *user) if (host->aclk_state.claimed_id) freez(host->aclk_state.claimed_id); host->aclk_state.claimed_id = strcmp(claim_id_str, "NULL") ? strdupz(claim_id_str) : NULL; - - metaqueue_store_claim_id(&host->host_uuid, host->aclk_state.claimed_id ? &uuid : NULL); - rrdhost_aclk_state_unlock(host); + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_CLAIMID |RRDHOST_FLAG_METADATA_UPDATE); + rrdpush_claimed_id(host); return PARSER_RC_OK; @@ -350,11 +330,13 @@ static void streaming_parser_thread_cleanup(void *ptr) { parser_destroy(parser); } +bool plugin_is_enabled(struct plugind *cd); + static size_t streaming_parser(struct receiver_state *rpt, struct plugind *cd, int fd, void *ssl) { size_t result; PARSER_USER_OBJECT user = { - .enabled = cd->enabled, + .enabled = plugin_is_enabled(cd), .host = rpt->host, .opaque = rpt, .cd = cd, @@ -390,39 +372,50 @@ static size_t streaming_parser(struct receiver_state *rpt, struct plugind *cd, i size_t read_buffer_start = 0; char buffer[PLUGINSD_LINE_MAX + 2] = ""; - while(!netdata_exit) { + while(service_running(SERVICE_STREAMING)) { + netdata_thread_testcancel(); + if(!receiver_next_line(rpt, buffer, PLUGINSD_LINE_MAX + 2, &read_buffer_start)) { bool have_new_data; - if(compressed_connection) + if(likely(compressed_connection)) have_new_data = receiver_read_compressed(rpt); else have_new_data = receiver_read_uncompressed(rpt); - if(!have_new_data) + if(unlikely(!have_new_data)) { + if(!rpt->exit.reason) + rpt->exit.reason = "SOCKET READ ERROR"; + break; + } rpt->last_msg_t = now_realtime_sec(); continue; } - if(unlikely(netdata_exit)) { - internal_error(true, "exiting..."); + if(unlikely(!service_running(SERVICE_STREAMING))) { + if(!rpt->exit.reason) + rpt->exit.reason = "NETDATA EXIT"; goto done; } - if(unlikely(rpt->shutdown)) { - internal_error(true, "parser shutdown..."); + if(unlikely(rpt->exit.shutdown)) { + if(!rpt->exit.reason) + rpt->exit.reason = "SHUTDOWN REQUESTED"; + goto done; } if (unlikely(parser_action(parser, buffer))) { internal_error(true, "parser_action() failed on keyword '%s'.", buffer); + + if(!rpt->exit.reason) + rpt->exit.reason = "PARSER FAILED"; + break; } } done: - internal_error(true, "Streaming receiver thread stopping..."); - result = user.count; // free parser with the pop function @@ -431,103 +424,240 @@ done: return result; } -static void rrdpush_receiver_replication_reset(struct receiver_state *rpt) { +static void rrdpush_receiver_replication_reset(RRDHOST *host) { RRDSET *st; - rrdset_foreach_read(st, rpt->host) { + rrdset_foreach_read(st, host) { rrdset_flag_clear(st, RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS); rrdset_flag_set(st, RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED); } rrdset_foreach_done(st); - rrdhost_receiver_replicating_charts_zero(rpt->host); + rrdhost_receiver_replicating_charts_zero(host); +} + +bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) { + bool signal_rrdcontext = false; + bool set_this = false; + + netdata_mutex_lock(&host->receiver_lock); + + if (!host->receiver || host->receiver == rpt) { + rrdhost_flag_clear(host, RRDHOST_FLAG_ORPHAN); + + host->receiver = rpt; + rpt->host = host; + + host->child_connect_time = now_realtime_sec(); + host->child_disconnected_time = 0; + host->child_last_chart_command = 0; + host->trigger_chart_obsoletion_check = 1; + + if (rpt->config.health_enabled != CONFIG_BOOLEAN_NO) { + if (rpt->config.alarms_delay > 0) { + host->health.health_delay_up_to = now_realtime_sec() + rpt->config.alarms_delay; + log_health( + "[%s]: Postponing health checks for %" PRId64 " seconds, because it was just connected.", + rrdhost_hostname(host), + (int64_t) rpt->config.alarms_delay); + } + } + +// this is a test +// if(rpt->hops <= host->sender->hops) +// rrdpush_sender_thread_stop(host, "HOPS MISMATCH", false); + + signal_rrdcontext = true; + rrdpush_receiver_replication_reset(host); + + rrdhost_flag_clear(rpt->host, RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED); + + set_this = true; + } + + netdata_mutex_unlock(&host->receiver_lock); + + if(signal_rrdcontext) + rrdcontext_host_child_connected(host); + + return set_this; +} + +static void rrdhost_clear_receiver(struct receiver_state *rpt) { + bool signal_rrdcontext = false; + + RRDHOST *host = rpt->host; + if(host) { + netdata_mutex_lock(&host->receiver_lock); + + // Make sure that we detach this thread and don't kill a freshly arriving receiver + if(host->receiver == rpt) { + host->trigger_chart_obsoletion_check = 0; + host->child_connect_time = 0; + host->child_disconnected_time = now_realtime_sec(); + + if (rpt->config.health_enabled == CONFIG_BOOLEAN_AUTO) + host->health.health_enabled = 0; + + rrdpush_sender_thread_stop(host, "RECEIVER LEFT", false); + + signal_rrdcontext = true; + rrdpush_receiver_replication_reset(host); + + if (host->receiver == rpt) + host->receiver = NULL; + + rrdhost_flag_set(host, RRDHOST_FLAG_ORPHAN); + } + + netdata_mutex_unlock(&host->receiver_lock); + + if(signal_rrdcontext) + rrdcontext_host_child_disconnected(host); + } +} + +bool stop_streaming_receiver(RRDHOST *host, const char *reason) { + bool ret = false; + + netdata_mutex_lock(&host->receiver_lock); + + if(host->receiver) { + if(!host->receiver->exit.shutdown) { + host->receiver->exit.shutdown = true; + host->receiver->exit.reason = reason; + shutdown(host->receiver->fd, SHUT_RDWR); + } + + netdata_thread_cancel(host->receiver->thread); + } + + int count = 2000; + while (host->receiver && count-- > 0) { + netdata_mutex_unlock(&host->receiver_lock); + + // let the lock for the receiver thread to exit + sleep_usec(1 * USEC_PER_MS); + + netdata_mutex_lock(&host->receiver_lock); + } + + if(host->receiver) + error("STREAM '%s' [receive from [%s]:%s]: " + "thread %d takes too long to stop, giving up..." + , rrdhost_hostname(host) + , host->receiver->client_ip, host->receiver->client_port + , gettid()); + else + ret = true; + + netdata_mutex_unlock(&host->receiver_lock); + + return ret; +} + +void rrdpush_receive_log_status(struct receiver_state *rpt, const char *msg, const char *status) { + + log_stream_connection(rpt->client_ip, rpt->client_port, + (rpt->key && *rpt->key)? rpt->key : "-", + (rpt->machine_guid && *rpt->machine_guid) ? rpt->machine_guid : "-", + (rpt->hostname && *rpt->hostname) ? rpt->hostname : "-", + status); + + info("STREAM '%s' [receive from [%s]:%s]: " + "%s. " + "STATUS: %s%s%s%s" + , rpt->hostname + , rpt->client_ip, rpt->client_port + , msg + , status + , rpt->exit.reason?" (":"" + , rpt->exit.reason?rpt->exit.reason:"" + , rpt->exit.reason?")":"" + ); + +} + +static void rrdhost_reset_destinations(RRDHOST *host) { + for (struct rrdpush_destinations *d = host->destinations; d; d = d->next) + d->postpone_reconnection_until = 0; } static int rrdpush_receive(struct receiver_state *rpt) { - int history = default_rrd_history_entries; - RRD_MEMORY_MODE mode = default_rrd_memory_mode; - int health_enabled = default_health_enabled; - int rrdpush_enabled = default_rrdpush_enabled; - char *rrdpush_destination = default_rrdpush_destination; - char *rrdpush_api_key = default_rrdpush_api_key; - char *rrdpush_send_charts_matching = default_rrdpush_send_charts_matching; - bool rrdpush_enable_replication = default_rrdpush_enable_replication; - time_t rrdpush_seconds_to_replicate = default_rrdpush_seconds_to_replicate; - time_t rrdpush_replication_step = default_rrdpush_replication_step; - time_t alarms_delay = 60; - - rpt->update_every = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "update every", rpt->update_every); - if(rpt->update_every < 0) rpt->update_every = 1; - - history = (int)appconfig_get_number(&stream_config, rpt->key, "default history", history); - history = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "history", history); - if(history < 5) history = 5; - - mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->key, "default memory mode", rrd_memory_mode_name(mode))); - mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->machine_guid, "memory mode", rrd_memory_mode_name(mode))); - - if (unlikely(mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled)) { - error("STREAM %s [receive from %s:%s]: dbengine is not enabled, falling back to default.", rpt->hostname, rpt->client_ip, rpt->client_port); - mode = default_rrd_memory_mode; + rpt->config.mode = default_rrd_memory_mode; + rpt->config.history = default_rrd_history_entries; + + rpt->config.health_enabled = (int)default_health_enabled; + rpt->config.alarms_delay = 60; + + rpt->config.rrdpush_enabled = (int)default_rrdpush_enabled; + rpt->config.rrdpush_destination = default_rrdpush_destination; + rpt->config.rrdpush_api_key = default_rrdpush_api_key; + rpt->config.rrdpush_send_charts_matching = default_rrdpush_send_charts_matching; + + rpt->config.rrdpush_enable_replication = default_rrdpush_enable_replication; + rpt->config.rrdpush_seconds_to_replicate = default_rrdpush_seconds_to_replicate; + rpt->config.rrdpush_replication_step = default_rrdpush_replication_step; + + rpt->config.update_every = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "update every", rpt->config.update_every); + if(rpt->config.update_every < 0) rpt->config.update_every = 1; + + rpt->config.history = (int)appconfig_get_number(&stream_config, rpt->key, "default history", rpt->config.history); + rpt->config.history = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "history", rpt->config.history); + if(rpt->config.history < 5) rpt->config.history = 5; + + rpt->config.mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->key, "default memory mode", rrd_memory_mode_name(rpt->config.mode))); + rpt->config.mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->machine_guid, "memory mode", rrd_memory_mode_name(rpt->config.mode))); + + if (unlikely(rpt->config.mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled)) { + error("STREAM '%s' [receive from %s:%s]: " + "dbengine is not enabled, falling back to default." + , rpt->hostname + , rpt->client_ip, rpt->client_port + ); + + rpt->config.mode = default_rrd_memory_mode; } - health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->key, "health enabled by default", health_enabled); - health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->machine_guid, "health enabled", health_enabled); + rpt->config.health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->key, "health enabled by default", rpt->config.health_enabled); + rpt->config.health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->machine_guid, "health enabled", rpt->config.health_enabled); - alarms_delay = appconfig_get_number(&stream_config, rpt->key, "default postpone alarms on connect seconds", alarms_delay); - alarms_delay = appconfig_get_number(&stream_config, rpt->machine_guid, "postpone alarms on connect seconds", alarms_delay); + rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->key, "default postpone alarms on connect seconds", rpt->config.alarms_delay); + rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->machine_guid, "postpone alarms on connect seconds", rpt->config.alarms_delay); - rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->key, "default proxy enabled", rrdpush_enabled); - rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->machine_guid, "proxy enabled", rrdpush_enabled); + rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->key, "default proxy enabled", rpt->config.rrdpush_enabled); + rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->machine_guid, "proxy enabled", rpt->config.rrdpush_enabled); - rrdpush_destination = appconfig_get(&stream_config, rpt->key, "default proxy destination", rrdpush_destination); - rrdpush_destination = appconfig_get(&stream_config, rpt->machine_guid, "proxy destination", rrdpush_destination); + rpt->config.rrdpush_destination = appconfig_get(&stream_config, rpt->key, "default proxy destination", rpt->config.rrdpush_destination); + rpt->config.rrdpush_destination = appconfig_get(&stream_config, rpt->machine_guid, "proxy destination", rpt->config.rrdpush_destination); - rrdpush_api_key = appconfig_get(&stream_config, rpt->key, "default proxy api key", rrdpush_api_key); - rrdpush_api_key = appconfig_get(&stream_config, rpt->machine_guid, "proxy api key", rrdpush_api_key); + rpt->config.rrdpush_api_key = appconfig_get(&stream_config, rpt->key, "default proxy api key", rpt->config.rrdpush_api_key); + rpt->config.rrdpush_api_key = appconfig_get(&stream_config, rpt->machine_guid, "proxy api key", rpt->config.rrdpush_api_key); - rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->key, "default proxy send charts matching", rrdpush_send_charts_matching); - rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->machine_guid, "proxy send charts matching", rrdpush_send_charts_matching); + rpt->config.rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->key, "default proxy send charts matching", rpt->config.rrdpush_send_charts_matching); + rpt->config.rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->machine_guid, "proxy send charts matching", rpt->config.rrdpush_send_charts_matching); - rrdpush_enable_replication = appconfig_get_boolean(&stream_config, rpt->key, "enable replication", rrdpush_enable_replication); - rrdpush_enable_replication = appconfig_get_boolean(&stream_config, rpt->machine_guid, "enable replication", rrdpush_enable_replication); + rpt->config.rrdpush_enable_replication = appconfig_get_boolean(&stream_config, rpt->key, "enable replication", rpt->config.rrdpush_enable_replication); + rpt->config.rrdpush_enable_replication = appconfig_get_boolean(&stream_config, rpt->machine_guid, "enable replication", rpt->config.rrdpush_enable_replication); - rrdpush_seconds_to_replicate = appconfig_get_number(&stream_config, rpt->key, "seconds to replicate", rrdpush_seconds_to_replicate); - rrdpush_seconds_to_replicate = appconfig_get_number(&stream_config, rpt->machine_guid, "seconds to replicate", rrdpush_seconds_to_replicate); + rpt->config.rrdpush_seconds_to_replicate = appconfig_get_number(&stream_config, rpt->key, "seconds to replicate", rpt->config.rrdpush_seconds_to_replicate); + rpt->config.rrdpush_seconds_to_replicate = appconfig_get_number(&stream_config, rpt->machine_guid, "seconds to replicate", rpt->config.rrdpush_seconds_to_replicate); - rrdpush_replication_step = appconfig_get_number(&stream_config, rpt->key, "seconds per replication step", rrdpush_replication_step); - rrdpush_replication_step = appconfig_get_number(&stream_config, rpt->machine_guid, "seconds per replication step", rrdpush_replication_step); + rpt->config.rrdpush_replication_step = appconfig_get_number(&stream_config, rpt->key, "seconds per replication step", rpt->config.rrdpush_replication_step); + rpt->config.rrdpush_replication_step = appconfig_get_number(&stream_config, rpt->machine_guid, "seconds per replication step", rpt->config.rrdpush_replication_step); #ifdef ENABLE_COMPRESSION - unsigned int rrdpush_compression = default_compression_enabled; - rrdpush_compression = appconfig_get_boolean(&stream_config, rpt->key, "enable compression", rrdpush_compression); - rrdpush_compression = appconfig_get_boolean(&stream_config, rpt->machine_guid, "enable compression", rrdpush_compression); - rpt->rrdpush_compression = (rrdpush_compression && default_compression_enabled); + rpt->config.rrdpush_compression = default_compression_enabled; + rpt->config.rrdpush_compression = appconfig_get_boolean(&stream_config, rpt->key, "enable compression", rpt->config.rrdpush_compression); + rpt->config.rrdpush_compression = appconfig_get_boolean(&stream_config, rpt->machine_guid, "enable compression", rpt->config.rrdpush_compression); + rpt->rrdpush_compression = (rpt->config.rrdpush_compression && default_compression_enabled); #endif //ENABLE_COMPRESSION (void)appconfig_set_default(&stream_config, rpt->machine_guid, "host tags", (rpt->tags)?rpt->tags:""); - if (strcmp(rpt->machine_guid, localhost->machine_guid) == 0) { - log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "DENIED - ATTEMPT TO RECEIVE METRICS FROM MACHINE_GUID IDENTICAL TO PARENT"); - error("STREAM %s [receive from %s:%s]: denied to receive metrics, machine GUID [%s] is my own. Did you copy the parent/proxy machine GUID to a child, or is this an inter-agent loop?", rpt->hostname, rpt->client_ip, rpt->client_port, rpt->machine_guid); - char initial_response[HTTP_HEADER_SIZE + 1]; - snprintfz(initial_response, HTTP_HEADER_SIZE, "%s", START_STREAMING_ERROR_SAME_LOCALHOST); -#ifdef ENABLE_HTTPS - if(send_timeout(&rpt->ssl, rpt->fd, initial_response, strlen(initial_response), 0, 60) != (ssize_t)strlen(initial_response)) { -#else - if(send_timeout(rpt->fd, initial_response, strlen(initial_response), 0, 60) != strlen(initial_response)) { -#endif - log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rrdhost_hostname(rpt->host), "FAILED - CANNOT REPLY"); - error("STREAM %s [receive from [%s]:%s]: cannot send command.", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port); - close(rpt->fd); - return 0; - } - close(rpt->fd); - return 0; - } - - if (rpt->host==NULL) { - - rpt->host = rrdhost_find_or_create( + // find the host for this receiver + { + // this will also update the host with our system_info + RRDHOST *host = rrdhost_find_or_create( rpt->hostname , rpt->registry_hostname , rpt->machine_guid @@ -538,76 +668,41 @@ static int rrdpush_receive(struct receiver_state *rpt) , rpt->tags , rpt->program_name , rpt->program_version - , rpt->update_every - , history - , mode - , (unsigned int)(health_enabled != CONFIG_BOOLEAN_NO) - , (unsigned int)(rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key) - , rrdpush_destination - , rrdpush_api_key - , rrdpush_send_charts_matching - , rrdpush_enable_replication - , rrdpush_seconds_to_replicate - , rrdpush_replication_step + , rpt->config.update_every + , rpt->config.history + , rpt->config.mode + , (unsigned int)(rpt->config.health_enabled != CONFIG_BOOLEAN_NO) + , (unsigned int)(rpt->config.rrdpush_enabled && rpt->config.rrdpush_destination && *rpt->config.rrdpush_destination && rpt->config.rrdpush_api_key && *rpt->config.rrdpush_api_key) + , rpt->config.rrdpush_destination + , rpt->config.rrdpush_api_key + , rpt->config.rrdpush_send_charts_matching + , rpt->config.rrdpush_enable_replication + , rpt->config.rrdpush_seconds_to_replicate + , rpt->config.rrdpush_replication_step , rpt->system_info , 0 ); - if(!rpt->host) { + if(!host) { + rrdpush_receive_log_status(rpt, "failed to find/create host structure", "INTERNAL ERROR DROPPING CONNECTION"); close(rpt->fd); - log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "FAILED - CANNOT ACQUIRE HOST"); - error("STREAM %s [receive from [%s]:%s]: failed to find/create host structure.", rpt->hostname, rpt->client_ip, rpt->client_port); return 1; } - netdata_mutex_lock(&rpt->host->receiver_lock); - if (rpt->host->receiver == NULL) - rpt->host->receiver = rpt; - else { - error("Multiple receivers connected for %s concurrently, cancelling this one...", rpt->machine_guid); - netdata_mutex_unlock(&rpt->host->receiver_lock); + // system_info has been consumed by the host structure + rpt->system_info = NULL; + + if(!rrdhost_set_receiver(host, rpt)) { + rrdpush_receive_log_status(rpt, "host is already served by another receiver", "DUPLICATE RECEIVER DROPPING CONNECTION"); close(rpt->fd); - log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "FAILED - BEATEN TO HOST CREATION"); return 1; } - netdata_mutex_unlock(&rpt->host->receiver_lock); - } - else { - rrd_wrlock(); - rrdhost_update( - rpt->host, - rpt->hostname, - rpt->registry_hostname, - rpt->machine_guid, - rpt->os, - rpt->timezone, - rpt->abbrev_timezone, - rpt->utc_offset, - rpt->tags, - rpt->program_name, - rpt->program_version, - rpt->update_every, - history, - mode, - (unsigned int)(health_enabled != CONFIG_BOOLEAN_NO), - (unsigned int)(rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key), - rrdpush_destination, - rrdpush_api_key, - rrdpush_send_charts_matching, - rrdpush_enable_replication, - rrdpush_seconds_to_replicate, - rrdpush_replication_step, - rpt->system_info); - rrd_unlock(); } #ifdef NETDATA_INTERNAL_CHECKS - int ssl = 0; -#ifdef ENABLE_HTTPS - if (rpt->ssl.conn != NULL) - ssl = 1; -#endif - info("STREAM %s [receive from [%s]:%s]: client willing to stream metrics for host '%s' with machine_guid '%s': update every = %d, history = %ld, memory mode = %s, health %s,%s tags '%s'" + info("STREAM '%s' [receive from [%s]:%s]: " + "client willing to stream metrics for host '%s' with machine_guid '%s': " + "update every = %d, history = %ld, memory mode = %s, health %s,%s tags '%s'" , rpt->hostname , rpt->client_ip , rpt->client_port @@ -616,20 +711,26 @@ static int rrdpush_receive(struct receiver_state *rpt) , rpt->host->rrd_update_every , rpt->host->rrd_history_entries , rrd_memory_mode_name(rpt->host->rrd_memory_mode) - , (health_enabled == CONFIG_BOOLEAN_NO)?"disabled":((health_enabled == CONFIG_BOOLEAN_YES)?"enabled":"auto") - , ssl ? " SSL," : "" + , (rpt->config.health_enabled == CONFIG_BOOLEAN_NO)?"disabled":((rpt->config.health_enabled == CONFIG_BOOLEAN_YES)?"enabled":"auto") +#ifdef ENABLE_HTTPS + , (rpt->ssl.conn != NULL) ? " SSL," : "" +#else + , "" +#endif , rrdhost_tags(rpt->host) ); #endif // NETDATA_INTERNAL_CHECKS struct plugind cd = { - .enabled = 1, .update_every = default_rrd_update_every, - .pid = 0, .serial_failures = 0, .successful_collections = 0, - .obsolete = 0, + .unsafe = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .running = true, + .enabled = true, + }, .started_t = now_realtime_sec(), .next = NULL, .capabilities = 0, @@ -648,76 +749,60 @@ static int rrdpush_receive(struct receiver_state *rpt) } #endif - info("STREAM %s [receive from [%s]:%s]: initializing communication...", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port); - char initial_response[HTTP_HEADER_SIZE]; - if (stream_has_capability(rpt, STREAM_CAP_VCAPS)) { - log_receiver_capabilities(rpt); - sprintf(initial_response, "%s%u", START_STREAMING_PROMPT_VN, rpt->capabilities); - } - else if (stream_has_capability(rpt, STREAM_CAP_VN)) { - log_receiver_capabilities(rpt); - sprintf(initial_response, "%s%d", START_STREAMING_PROMPT_VN, stream_capabilities_to_vn(rpt->capabilities)); - } else if (stream_has_capability(rpt, STREAM_CAP_V2)) { - log_receiver_capabilities(rpt); - sprintf(initial_response, "%s", START_STREAMING_PROMPT_V2); - } else { // stream_has_capability(rpt, STREAM_CAP_V1) - log_receiver_capabilities(rpt); - sprintf(initial_response, "%s", START_STREAMING_PROMPT_V1); - } - debug(D_STREAM, "Initial response to %s: %s", rpt->client_ip, initial_response); + { + // info("STREAM %s [receive from [%s]:%s]: initializing communication...", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port); + char initial_response[HTTP_HEADER_SIZE]; + if (stream_has_capability(rpt, STREAM_CAP_VCAPS)) { + log_receiver_capabilities(rpt); + sprintf(initial_response, "%s%u", START_STREAMING_PROMPT_VN, rpt->capabilities); + } + else if (stream_has_capability(rpt, STREAM_CAP_VN)) { + log_receiver_capabilities(rpt); + sprintf(initial_response, "%s%d", START_STREAMING_PROMPT_VN, stream_capabilities_to_vn(rpt->capabilities)); + } + else if (stream_has_capability(rpt, STREAM_CAP_V2)) { + log_receiver_capabilities(rpt); + sprintf(initial_response, "%s", START_STREAMING_PROMPT_V2); + } + else { // stream_has_capability(rpt, STREAM_CAP_V1) + log_receiver_capabilities(rpt); + sprintf(initial_response, "%s", START_STREAMING_PROMPT_V1); + } + + debug(D_STREAM, "Initial response to %s: %s", rpt->client_ip, initial_response); + if(send_timeout( #ifdef ENABLE_HTTPS - if(send_timeout(&rpt->ssl, rpt->fd, initial_response, strlen(initial_response), 0, 60) != (ssize_t)strlen(initial_response)) { -#else - if(send_timeout(rpt->fd, initial_response, strlen(initial_response), 0, 60) != strlen(initial_response)) { + &rpt->ssl, #endif - log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rrdhost_hostname(rpt->host), "FAILED - CANNOT REPLY"); - error("STREAM %s [receive from [%s]:%s]: cannot send ready command.", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port); - close(rpt->fd); - return 0; - } + rpt->fd, initial_response, strlen(initial_response), 0, 60) != (ssize_t)strlen(initial_response)) { - // remove the non-blocking flag from the socket - if(sock_delnonblock(rpt->fd) < 0) - error("STREAM %s [receive from [%s]:%s]: cannot remove the non-blocking flag from socket %d", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, rpt->fd); - - struct timeval timeout; - timeout.tv_sec = 600; - timeout.tv_usec = 0; - if (unlikely(setsockopt(rpt->fd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof timeout) != 0)) - error("STREAM %s [receive from [%s]:%s]: cannot set timeout for socket %d", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, rpt->fd); - - rrdhost_wrlock(rpt->host); -/* if(rpt->host->connected_senders > 0) { - rrdhost_unlock(rpt->host); - log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rpt->host->hostname, "REJECTED - ALREADY CONNECTED"); - info("STREAM %s [receive from [%s]:%s]: multiple streaming connections for the same host detected. Rejecting new connection.", rpt->host->hostname, rpt->client_ip, rpt->client_port); - fclose(fp); - return 0; - } -*/ - -// rpt->host->connected_senders++; - if(health_enabled != CONFIG_BOOLEAN_NO) { - if(alarms_delay > 0) { - rpt->host->health_delay_up_to = now_realtime_sec() + alarms_delay; - log_health( - "[%s]: Postponing health checks for %" PRId64 " seconds, because it was just connected.", - rrdhost_hostname(rpt->host), - (int64_t)alarms_delay); + rrdpush_receive_log_status(rpt, "cannot reply back", "CANT REPLY DROPPING CONNECTION"); + close(rpt->fd); + return 0; } } - rpt->host->senders_connect_time = now_realtime_sec(); - rpt->host->senders_last_chart_command = 0; - rpt->host->trigger_chart_obsoletion_check = 1; - rrdhost_unlock(rpt->host); + { + // remove the non-blocking flag from the socket + if(sock_delnonblock(rpt->fd) < 0) + error("STREAM '%s' [receive from [%s]:%s]: " + "cannot remove the non-blocking flag from socket %d" + , rrdhost_hostname(rpt->host) + , rpt->client_ip, rpt->client_port + , rpt->fd); - // call the plugins.d processor to receive the metrics - info("STREAM %s [receive from [%s]:%s]: receiving metrics...", - rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port); + struct timeval timeout; + timeout.tv_sec = 600; + timeout.tv_usec = 0; + if (unlikely(setsockopt(rpt->fd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof timeout) != 0)) + error("STREAM '%s' [receive from [%s]:%s]: " + "cannot set timeout for socket %d" + , rrdhost_hostname(rpt->host) + , rpt->client_ip, rpt->client_port + , rpt->fd); + } - log_stream_connection(rpt->client_ip, rpt->client_port, - rpt->key, rpt->host->machine_guid, rrdhost_hostname(rpt->host), "CONNECTED"); + rrdpush_receive_log_status(rpt, "ready to receive data", "CONNECTED"); cd.capabilities = rpt->capabilities; @@ -728,12 +813,10 @@ static int rrdpush_receive(struct receiver_state *rpt) aclk_host_state_update(rpt->host, 1); #endif - rrdhost_set_is_parent_label(++localhost->senders_count); + rrdhost_set_is_parent_label(++localhost->connected_children_count); - rrdpush_receiver_replication_reset(rpt); - rrdcontext_host_child_connected(rpt->host); - - rrdhost_flag_clear(rpt->host, RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED); + // let it reconnect to parent immediately + rrdhost_reset_destinations(rpt->host); size_t count = streaming_parser(rpt, &cd, rpt->fd, #ifdef ENABLE_HTTPS @@ -745,15 +828,14 @@ static int rrdpush_receive(struct receiver_state *rpt) rrdhost_flag_set(rpt->host, RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED); - log_stream_connection(rpt->client_ip, rpt->client_port, - rpt->key, rpt->host->machine_guid, rpt->hostname, - "DISCONNECTED"); + if(!rpt->exit.reason) + rpt->exit.reason = "PARSER EXIT"; - error("STREAM %s [receive from [%s]:%s]: disconnected (completed %zu updates).", - rpt->hostname, rpt->client_ip, rpt->client_port, count); - - rrdcontext_host_child_disconnected(rpt->host); - rrdpush_receiver_replication_reset(rpt); + { + char msg[100 + 1]; + snprintfz(msg, 100, "disconnected (completed %zu updates)", count); + rrdpush_receive_log_status(rpt, msg, "DISCONNECTED"); + } #ifdef ENABLE_ACLK // in case we have cloud connection we inform cloud @@ -762,48 +844,41 @@ static int rrdpush_receive(struct receiver_state *rpt) aclk_host_state_update(rpt->host, 0); #endif - rrdhost_set_is_parent_label(--localhost->senders_count); - - // During a shutdown there is cleanup code in rrdhost that will cancel the sender thread - if (!netdata_exit && rpt->host) { - rrd_rdlock(); - rrdhost_wrlock(rpt->host); - netdata_mutex_lock(&rpt->host->receiver_lock); - if (rpt->host->receiver == rpt) { - rpt->host->senders_connect_time = 0; - rpt->host->trigger_chart_obsoletion_check = 0; - rpt->host->senders_disconnected_time = now_realtime_sec(); - rrdhost_flag_set(rpt->host, RRDHOST_FLAG_ORPHAN); - if(health_enabled == CONFIG_BOOLEAN_AUTO) - rpt->host->health_enabled = 0; - } - rrdhost_unlock(rpt->host); - if (rpt->host->receiver == rpt) { - rrdpush_sender_thread_stop(rpt->host); - } - netdata_mutex_unlock(&rpt->host->receiver_lock); - rrd_unlock(); - } + rrdhost_set_is_parent_label(--localhost->connected_children_count); // cleanup close(rpt->fd); return (int)count; } +static void rrdpush_receiver_thread_cleanup(void *ptr) { + struct receiver_state *rpt = (struct receiver_state *) ptr; + worker_unregister(); + + rrdhost_clear_receiver(rpt); + + info("STREAM '%s' [receive from [%s]:%s]: " + "receive thread ended (task id %d)" + , rpt->hostname ? rpt->hostname : "-" + , rpt->client_ip ? rpt->client_ip : "-", rpt->client_port ? rpt->client_port : "-" + , gettid()); + + receiver_state_free(rpt); +} + void *rrdpush_receiver_thread(void *ptr) { netdata_thread_cleanup_push(rrdpush_receiver_thread_cleanup, ptr); - struct receiver_state *rpt = (struct receiver_state *)ptr; - info("STREAM %s [%s]:%s: receive thread created (task id %d)", rpt->hostname, rpt->client_ip, rpt->client_port, gettid()); - worker_register("STREAMRCV"); worker_register_job_custom_metric(WORKER_RECEIVER_JOB_BYTES_READ, "received bytes", "bytes/s", WORKER_METRIC_INCREMENT); worker_register_job_custom_metric(WORKER_RECEIVER_JOB_BYTES_UNCOMPRESSED, "uncompressed bytes", "bytes/s", WORKER_METRIC_INCREMENT); worker_register_job_custom_metric(WORKER_RECEIVER_JOB_REPLICATION_COMPLETION, "replication completion", "%", WORKER_METRIC_ABSOLUTE); + + struct receiver_state *rpt = (struct receiver_state *)ptr; + info("STREAM %s [%s]:%s: receive thread created (task id %d)", rpt->hostname, rpt->client_ip, rpt->client_port, gettid()); + rrdpush_receive(rpt); - worker_unregister(); netdata_thread_cleanup_pop(1); return NULL; } - |