diff options
Diffstat (limited to 'src/streaming/sender-connect.c')
-rw-r--r-- | src/streaming/sender-connect.c | 741 |
1 files changed, 741 insertions, 0 deletions
diff --git a/src/streaming/sender-connect.c b/src/streaming/sender-connect.c new file mode 100644 index 000000000..ac5f392a0 --- /dev/null +++ b/src/streaming/sender-connect.c @@ -0,0 +1,741 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "sender-internals.h" + +void rrdpush_sender_thread_close_socket(struct sender_state *s) { + rrdhost_flag_clear(s->host, RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED | RRDHOST_FLAG_RRDPUSH_SENDER_READY_4_METRICS); + + netdata_ssl_close(&s->ssl); + + if(s->rrdpush_sender_socket != -1) { + close(s->rrdpush_sender_socket); + s->rrdpush_sender_socket = -1; + } + + // do not flush the circular buffer here + // this function is called sometimes with the sender lock, sometimes without the lock +} + +void rrdpush_encode_variable(stream_encoded_t *se, RRDHOST *host) { + se->os_name = (host->system_info->host_os_name)?url_encode(host->system_info->host_os_name):strdupz(""); + se->os_id = (host->system_info->host_os_id)?url_encode(host->system_info->host_os_id):strdupz(""); + se->os_version = (host->system_info->host_os_version)?url_encode(host->system_info->host_os_version):strdupz(""); + se->kernel_name = (host->system_info->kernel_name)?url_encode(host->system_info->kernel_name):strdupz(""); + se->kernel_version = (host->system_info->kernel_version)?url_encode(host->system_info->kernel_version):strdupz(""); +} + +void rrdpush_clean_encoded(stream_encoded_t *se) { + if (se->os_name) { + freez(se->os_name); + se->os_name = NULL; + } + + if (se->os_id) { + freez(se->os_id); + se->os_id = NULL; + } + + if (se->os_version) { + freez(se->os_version); + se->os_version = NULL; + } + + if (se->kernel_name) { + freez(se->kernel_name); + se->kernel_name = NULL; + } + + if (se->kernel_version) { + freez(se->kernel_version); + se->kernel_version = NULL; + } +} + +struct { + const char *response; + const char *status; + size_t length; + int32_t version; + bool dynamic; + const char *error; + int worker_job_id; + int postpone_reconnect_seconds; + ND_LOG_FIELD_PRIORITY priority; +} stream_responses[] = { + { + .response = START_STREAMING_PROMPT_VN, + .length = sizeof(START_STREAMING_PROMPT_VN) - 1, + .status = RRDPUSH_STATUS_CONNECTED, + .version = STREAM_HANDSHAKE_OK_V3, // and above + .dynamic = true, // dynamic = we will parse the version / capabilities + .error = NULL, + .worker_job_id = 0, + .postpone_reconnect_seconds = 0, + .priority = NDLP_INFO, + }, + { + .response = START_STREAMING_PROMPT_V2, + .length = sizeof(START_STREAMING_PROMPT_V2) - 1, + .status = RRDPUSH_STATUS_CONNECTED, + .version = STREAM_HANDSHAKE_OK_V2, + .dynamic = false, + .error = NULL, + .worker_job_id = 0, + .postpone_reconnect_seconds = 0, + .priority = NDLP_INFO, + }, + { + .response = START_STREAMING_PROMPT_V1, + .length = sizeof(START_STREAMING_PROMPT_V1) - 1, + .status = RRDPUSH_STATUS_CONNECTED, + .version = STREAM_HANDSHAKE_OK_V1, + .dynamic = false, + .error = NULL, + .worker_job_id = 0, + .postpone_reconnect_seconds = 0, + .priority = NDLP_INFO, + }, + { + .response = START_STREAMING_ERROR_SAME_LOCALHOST, + .length = sizeof(START_STREAMING_ERROR_SAME_LOCALHOST) - 1, + .status = RRDPUSH_STATUS_LOCALHOST, + .version = STREAM_HANDSHAKE_ERROR_LOCALHOST, + .dynamic = false, + .error = "remote server rejected this stream, the host we are trying to stream is its localhost", + .worker_job_id = WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, + .postpone_reconnect_seconds = 60 * 60, // the IP may change, try it every hour + .priority = NDLP_DEBUG, + }, + { + .response = START_STREAMING_ERROR_ALREADY_STREAMING, + .length = sizeof(START_STREAMING_ERROR_ALREADY_STREAMING) - 1, + .status = RRDPUSH_STATUS_ALREADY_CONNECTED, + .version = STREAM_HANDSHAKE_ERROR_ALREADY_CONNECTED, + .dynamic = false, + .error = "remote server rejected this stream, the host we are trying to stream is already streamed to it", + .worker_job_id = WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, + .postpone_reconnect_seconds = 2 * 60, // 2 minutes + .priority = NDLP_DEBUG, + }, + { + .response = START_STREAMING_ERROR_NOT_PERMITTED, + .length = sizeof(START_STREAMING_ERROR_NOT_PERMITTED) - 1, + .status = RRDPUSH_STATUS_PERMISSION_DENIED, + .version = STREAM_HANDSHAKE_ERROR_DENIED, + .dynamic = false, + .error = "remote server denied access, probably we don't have the right API key?", + .worker_job_id = WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, + .postpone_reconnect_seconds = 1 * 60, // 1 minute + .priority = NDLP_ERR, + }, + { + .response = START_STREAMING_ERROR_BUSY_TRY_LATER, + .length = sizeof(START_STREAMING_ERROR_BUSY_TRY_LATER) - 1, + .status = RRDPUSH_STATUS_RATE_LIMIT, + .version = STREAM_HANDSHAKE_BUSY_TRY_LATER, + .dynamic = false, + .error = "remote server is currently busy, we should try later", + .worker_job_id = WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, + .postpone_reconnect_seconds = 2 * 60, // 2 minutes + .priority = NDLP_NOTICE, + }, + { + .response = START_STREAMING_ERROR_INTERNAL_ERROR, + .length = sizeof(START_STREAMING_ERROR_INTERNAL_ERROR) - 1, + .status = RRDPUSH_STATUS_INTERNAL_SERVER_ERROR, + .version = STREAM_HANDSHAKE_INTERNAL_ERROR, + .dynamic = false, + .error = "remote server is encountered an internal error, we should try later", + .worker_job_id = WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, + .postpone_reconnect_seconds = 5 * 60, // 5 minutes + .priority = NDLP_CRIT, + }, + { + .response = START_STREAMING_ERROR_INITIALIZATION, + .length = sizeof(START_STREAMING_ERROR_INITIALIZATION) - 1, + .status = RRDPUSH_STATUS_INITIALIZATION_IN_PROGRESS, + .version = STREAM_HANDSHAKE_INITIALIZATION, + .dynamic = false, + .error = "remote server is initializing, we should try later", + .worker_job_id = WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, + .postpone_reconnect_seconds = 2 * 60, // 2 minute + .priority = NDLP_NOTICE, + }, + + // terminator + { + .response = NULL, + .length = 0, + .status = RRDPUSH_STATUS_BAD_HANDSHAKE, + .version = STREAM_HANDSHAKE_ERROR_BAD_HANDSHAKE, + .dynamic = false, + .error = "remote node response is not understood, is it Netdata?", + .worker_job_id = WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, + .postpone_reconnect_seconds = 1 * 60, // 1 minute + .priority = NDLP_ERR, + } +}; + +static inline bool rrdpush_sender_validate_response(RRDHOST *host, struct sender_state *s, char *http, size_t http_length) { + int32_t version = STREAM_HANDSHAKE_ERROR_BAD_HANDSHAKE; + + int i; + for(i = 0; stream_responses[i].response ; i++) { + if(stream_responses[i].dynamic && + http_length > stream_responses[i].length && http_length < (stream_responses[i].length + 30) && + strncmp(http, stream_responses[i].response, stream_responses[i].length) == 0) { + + version = str2i(&http[stream_responses[i].length]); + break; + } + else if(http_length == stream_responses[i].length && strcmp(http, stream_responses[i].response) == 0) { + version = stream_responses[i].version; + + break; + } + } + + if(version >= STREAM_HANDSHAKE_OK_V1) { + host->destination->reason = version; + host->destination->postpone_reconnection_until = now_realtime_sec() + s->reconnect_delay; + s->capabilities = convert_stream_version_to_capabilities(version, host, true); + return true; + } + + ND_LOG_FIELD_PRIORITY priority = stream_responses[i].priority; + const char *error = stream_responses[i].error; + const char *status = stream_responses[i].status; + int worker_job_id = stream_responses[i].worker_job_id; + int delay = stream_responses[i].postpone_reconnect_seconds; + + worker_is_busy(worker_job_id); + rrdpush_sender_thread_close_socket(s); + host->destination->reason = version; + host->destination->postpone_reconnection_until = now_realtime_sec() + delay; + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, status), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + char buf[RFC3339_MAX_LENGTH]; + rfc3339_datetime_ut(buf, sizeof(buf), host->destination->postpone_reconnection_until * USEC_PER_SEC, 0, false); + + nd_log(NDLS_DAEMON, priority, + "STREAM %s [send to %s]: %s - will retry in %d secs, at %s", + rrdhost_hostname(host), s->connected_to, error, delay, buf); + + return false; +} + +unsigned char alpn_proto_list[] = { + 18, 'n', 'e', 't', 'd', 'a', 't', 'a', '_', 's', 't', 'r', 'e', 'a', 'm', '/', '2', '.', '0', + 8, 'h', 't', 't', 'p', '/', '1', '.', '1' +}; + +#define CONN_UPGRADE_VAL "upgrade" + +static bool rrdpush_sender_connect_ssl(struct sender_state *s) { + RRDHOST *host = s->host; + bool ssl_required = host && host->destination && host->destination->ssl; + + netdata_ssl_close(&host->sender->ssl); + + if(!ssl_required) + return true; + + if (netdata_ssl_open_ext(&host->sender->ssl, netdata_ssl_streaming_sender_ctx, s->rrdpush_sender_socket, alpn_proto_list, sizeof(alpn_proto_list))) { + if(!netdata_ssl_connect(&host->sender->ssl)) { + // couldn't connect + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, RRDPUSH_STATUS_SSL_ERROR), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); + rrdpush_sender_thread_close_socket(s); + host->destination->reason = STREAM_HANDSHAKE_ERROR_SSL_ERROR; + host->destination->postpone_reconnection_until = now_realtime_sec() + 5 * 60; + return false; + } + + if (netdata_ssl_validate_certificate_sender && + security_test_certificate(host->sender->ssl.conn)) { + // certificate is not valid + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, RRDPUSH_STATUS_INVALID_SSL_CERTIFICATE), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); + netdata_log_error("SSL: closing the stream connection, because the server SSL certificate is not valid."); + rrdpush_sender_thread_close_socket(s); + host->destination->reason = STREAM_HANDSHAKE_ERROR_INVALID_CERTIFICATE; + host->destination->postpone_reconnection_until = now_realtime_sec() + 5 * 60; + return false; + } + + return true; + } + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, RRDPUSH_STATUS_CANT_ESTABLISH_SSL_CONNECTION), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + netdata_log_error("SSL: failed to establish connection."); + return false; +} + +static int rrdpush_http_upgrade_prelude(RRDHOST *host, struct sender_state *s) { + + char http[HTTP_HEADER_SIZE + 1]; + snprintfz(http, HTTP_HEADER_SIZE, + "GET " NETDATA_STREAM_URL HTTP_1_1 HTTP_ENDL + "Upgrade: " NETDATA_STREAM_PROTO_NAME HTTP_ENDL + "Connection: Upgrade" + HTTP_HDR_END); + + ssize_t bytes = send_timeout( + &host->sender->ssl, + s->rrdpush_sender_socket, + http, + strlen(http), + 0, + 1000); + + bytes = recv_timeout( + &host->sender->ssl, + s->rrdpush_sender_socket, + http, + HTTP_HEADER_SIZE, + 0, + 1000); + + if (bytes <= 0) { + error_report("Error reading from remote"); + return 1; + } + + rbuf_t buf = rbuf_create(bytes); + rbuf_push(buf, http, bytes); + + http_parse_ctx ctx; + http_parse_ctx_create(&ctx, HTTP_PARSE_INITIAL); + ctx.flags |= HTTP_PARSE_FLAG_DONT_WAIT_FOR_CONTENT; + + int rc; + // while((rc = parse_http_response(buf, &ctx)) == HTTP_PARSE_NEED_MORE_DATA); + rc = parse_http_response(buf, &ctx); + + if (rc != HTTP_PARSE_SUCCESS) { + error_report("Failed to parse HTTP response sent. (%d)", rc); + goto err_cleanup; + } + if (ctx.http_code == HTTP_RESP_MOVED_PERM) { + const char *hdr = get_http_header_by_name(&ctx, "location"); + if (hdr) + error_report("HTTP response is %d Moved Permanently (location: \"%s\") instead of expected %d Switching Protocols.", ctx.http_code, hdr, HTTP_RESP_SWITCH_PROTO); + else + error_report("HTTP response is %d instead of expected %d Switching Protocols.", ctx.http_code, HTTP_RESP_SWITCH_PROTO); + goto err_cleanup; + } + if (ctx.http_code == HTTP_RESP_NOT_FOUND) { + error_report("HTTP response is %d instead of expected %d Switching Protocols. Parent version too old.", ctx.http_code, HTTP_RESP_SWITCH_PROTO); + // TODO set some flag here that will signify parent is older version + // and to try connection without rrdpush_http_upgrade_prelude next time + goto err_cleanup; + } + if (ctx.http_code != HTTP_RESP_SWITCH_PROTO) { + error_report("HTTP response is %d instead of expected %d Switching Protocols", ctx.http_code, HTTP_RESP_SWITCH_PROTO); + goto err_cleanup; + } + + const char *hdr = get_http_header_by_name(&ctx, "connection"); + if (!hdr) { + error_report("Missing \"connection\" header in reply"); + goto err_cleanup; + } + if (strncmp(hdr, CONN_UPGRADE_VAL, strlen(CONN_UPGRADE_VAL))) { + error_report("Expected \"connection: " CONN_UPGRADE_VAL "\""); + goto err_cleanup; + } + + hdr = get_http_header_by_name(&ctx, "upgrade"); + if (!hdr) { + error_report("Missing \"upgrade\" header in reply"); + goto err_cleanup; + } + if (strncmp(hdr, NETDATA_STREAM_PROTO_NAME, strlen(NETDATA_STREAM_PROTO_NAME))) { + error_report("Expected \"upgrade: " NETDATA_STREAM_PROTO_NAME "\""); + goto err_cleanup; + } + + netdata_log_debug(D_STREAM, "Stream sender upgrade to \"" NETDATA_STREAM_PROTO_NAME "\" successful"); + rbuf_free(buf); + http_parse_ctx_destroy(&ctx); + return 0; +err_cleanup: + rbuf_free(buf); + http_parse_ctx_destroy(&ctx); + return 1; +} + +static bool sender_send_connection_request(RRDHOST *host, int default_port, int timeout, struct sender_state *s) { + + struct timeval tv = { + .tv_sec = timeout, + .tv_usec = 0 + }; + + // make sure the socket is closed + rrdpush_sender_thread_close_socket(s); + + s->rrdpush_sender_socket = connect_to_one_of_destinations( + host + , default_port + , &tv + , &s->reconnects_counter + , s->connected_to + , sizeof(s->connected_to)-1 + , &host->destination + ); + + if(unlikely(s->rrdpush_sender_socket == -1)) { + // netdata_log_error("STREAM %s [send to %s]: could not connect to parent node at this time.", rrdhost_hostname(host), host->rrdpush_send_destination); + return false; + } + + // netdata_log_info("STREAM %s [send to %s]: initializing communication...", rrdhost_hostname(host), s->connected_to); + + // reset our capabilities to default + s->capabilities = stream_our_capabilities(host, true); + + /* TODO: During the implementation of #7265 switch the set of variables to HOST_* and CONTAINER_* if the + version negotiation resulted in a high enough version. + */ + stream_encoded_t se; + rrdpush_encode_variable(&se, host); + + host->sender->hops = host->system_info->hops + 1; + + char http[HTTP_HEADER_SIZE + 1]; + int eol = snprintfz(http, HTTP_HEADER_SIZE, + "STREAM " + "key=%s" + "&hostname=%s" + "®istry_hostname=%s" + "&machine_guid=%s" + "&update_every=%d" + "&os=%s" + "&timezone=%s" + "&abbrev_timezone=%s" + "&utc_offset=%d" + "&hops=%d" + "&ml_capable=%d" + "&ml_enabled=%d" + "&mc_version=%d" + "&ver=%u" + "&NETDATA_INSTANCE_CLOUD_TYPE=%s" + "&NETDATA_INSTANCE_CLOUD_INSTANCE_TYPE=%s" + "&NETDATA_INSTANCE_CLOUD_INSTANCE_REGION=%s" + "&NETDATA_SYSTEM_OS_NAME=%s" + "&NETDATA_SYSTEM_OS_ID=%s" + "&NETDATA_SYSTEM_OS_ID_LIKE=%s" + "&NETDATA_SYSTEM_OS_VERSION=%s" + "&NETDATA_SYSTEM_OS_VERSION_ID=%s" + "&NETDATA_SYSTEM_OS_DETECTION=%s" + "&NETDATA_HOST_IS_K8S_NODE=%s" + "&NETDATA_SYSTEM_KERNEL_NAME=%s" + "&NETDATA_SYSTEM_KERNEL_VERSION=%s" + "&NETDATA_SYSTEM_ARCHITECTURE=%s" + "&NETDATA_SYSTEM_VIRTUALIZATION=%s" + "&NETDATA_SYSTEM_VIRT_DETECTION=%s" + "&NETDATA_SYSTEM_CONTAINER=%s" + "&NETDATA_SYSTEM_CONTAINER_DETECTION=%s" + "&NETDATA_CONTAINER_OS_NAME=%s" + "&NETDATA_CONTAINER_OS_ID=%s" + "&NETDATA_CONTAINER_OS_ID_LIKE=%s" + "&NETDATA_CONTAINER_OS_VERSION=%s" + "&NETDATA_CONTAINER_OS_VERSION_ID=%s" + "&NETDATA_CONTAINER_OS_DETECTION=%s" + "&NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT=%s" + "&NETDATA_SYSTEM_CPU_FREQ=%s" + "&NETDATA_SYSTEM_TOTAL_RAM=%s" + "&NETDATA_SYSTEM_TOTAL_DISK_SIZE=%s" + "&NETDATA_PROTOCOL_VERSION=%s" + HTTP_1_1 HTTP_ENDL + "User-Agent: %s/%s\r\n" + "Accept: */*\r\n\r\n" + , host->rrdpush.send.api_key + , rrdhost_hostname(host) + , rrdhost_registry_hostname(host) + , host->machine_guid + , default_rrd_update_every + , rrdhost_os(host) + , rrdhost_timezone(host) + , rrdhost_abbrev_timezone(host) + , host->utc_offset + , host->sender->hops + , host->system_info->ml_capable + , host->system_info->ml_enabled + , host->system_info->mc_version + , s->capabilities + , (host->system_info->cloud_provider_type) ? host->system_info->cloud_provider_type : "" + , (host->system_info->cloud_instance_type) ? host->system_info->cloud_instance_type : "" + , (host->system_info->cloud_instance_region) ? host->system_info->cloud_instance_region : "" + , se.os_name + , se.os_id + , (host->system_info->host_os_id_like) ? host->system_info->host_os_id_like : "" + , se.os_version + , (host->system_info->host_os_version_id) ? host->system_info->host_os_version_id : "" + , (host->system_info->host_os_detection) ? host->system_info->host_os_detection : "" + , (host->system_info->is_k8s_node) ? host->system_info->is_k8s_node : "" + , se.kernel_name + , se.kernel_version + , (host->system_info->architecture) ? host->system_info->architecture : "" + , (host->system_info->virtualization) ? host->system_info->virtualization : "" + , (host->system_info->virt_detection) ? host->system_info->virt_detection : "" + , (host->system_info->container) ? host->system_info->container : "" + , (host->system_info->container_detection) ? host->system_info->container_detection : "" + , (host->system_info->container_os_name) ? host->system_info->container_os_name : "" + , (host->system_info->container_os_id) ? host->system_info->container_os_id : "" + , (host->system_info->container_os_id_like) ? host->system_info->container_os_id_like : "" + , (host->system_info->container_os_version) ? host->system_info->container_os_version : "" + , (host->system_info->container_os_version_id) ? host->system_info->container_os_version_id : "" + , (host->system_info->container_os_detection) ? host->system_info->container_os_detection : "" + , (host->system_info->host_cores) ? host->system_info->host_cores : "" + , (host->system_info->host_cpu_freq) ? host->system_info->host_cpu_freq : "" + , (host->system_info->host_ram_total) ? host->system_info->host_ram_total : "" + , (host->system_info->host_disk_space) ? host->system_info->host_disk_space : "" + , STREAMING_PROTOCOL_VERSION + , rrdhost_program_name(host) + , rrdhost_program_version(host) + ); + http[eol] = 0x00; + rrdpush_clean_encoded(&se); + + if(!rrdpush_sender_connect_ssl(s)) + return false; + + if (s->parent_using_h2o && rrdpush_http_upgrade_prelude(host, s)) { + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, RRDPUSH_STATUS_CANT_UPGRADE_CONNECTION), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_CANT_UPGRADE_CONNECTION); + rrdpush_sender_thread_close_socket(s); + host->destination->reason = STREAM_HANDSHAKE_ERROR_HTTP_UPGRADE; + host->destination->postpone_reconnection_until = now_realtime_sec() + 1 * 60; + return false; + } + + ssize_t len = (ssize_t)strlen(http); + ssize_t bytes = send_timeout( + &host->sender->ssl, + s->rrdpush_sender_socket, + http, + len, + 0, + timeout); + + if(bytes <= 0) { // timeout is 0 + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, RRDPUSH_STATUS_TIMEOUT), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); + rrdpush_sender_thread_close_socket(s); + + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM %s [send to %s]: failed to send HTTP header to remote netdata.", + rrdhost_hostname(host), s->connected_to); + + host->destination->reason = STREAM_HANDSHAKE_ERROR_SEND_TIMEOUT; + host->destination->postpone_reconnection_until = now_realtime_sec() + 1 * 60; + return false; + } + + bytes = recv_timeout( + &host->sender->ssl, + s->rrdpush_sender_socket, + http, + HTTP_HEADER_SIZE, + 0, + timeout); + + if(bytes <= 0) { // timeout is 0 + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, RRDPUSH_STATUS_TIMEOUT), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); + rrdpush_sender_thread_close_socket(s); + + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM %s [send to %s]: remote netdata does not respond.", + rrdhost_hostname(host), s->connected_to); + + host->destination->reason = STREAM_HANDSHAKE_ERROR_RECEIVE_TIMEOUT; + host->destination->postpone_reconnection_until = now_realtime_sec() + 30; + return false; + } + + if(sock_setnonblock(s->rrdpush_sender_socket) < 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, + "STREAM %s [send to %s]: cannot set non-blocking mode for socket.", + rrdhost_hostname(host), s->connected_to); + sock_setcloexec(s->rrdpush_sender_socket); + + if(sock_enlarge_out(s->rrdpush_sender_socket) < 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, + "STREAM %s [send to %s]: cannot enlarge the socket buffer.", + rrdhost_hostname(host), s->connected_to); + + http[bytes] = '\0'; + if(!rrdpush_sender_validate_response(host, s, http, bytes)) + return false; + + rrdpush_compression_initialize(s); + + log_sender_capabilities(s); + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, RRDPUSH_STATUS_CONNECTED), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "STREAM %s: connected to %s...", + rrdhost_hostname(host), s->connected_to); + + return true; +} + +bool attempt_to_connect(struct sender_state *state) { + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &streaming_to_parent_msgid), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + state->send_attempts = 0; + + // reset the bytes we have sent for this session + state->sent_bytes_on_this_connection = 0; + memset(state->sent_bytes_on_this_connection_per_type, 0, sizeof(state->sent_bytes_on_this_connection_per_type)); + + if(sender_send_connection_request(state->host, state->default_port, state->timeout, state)) { + // reset the buffer, to properly send charts and metrics + rrdpush_sender_on_connect(state->host); + + // send from the beginning + state->begin = 0; + + // make sure the next reconnection will be immediate + state->not_connected_loops = 0; + + // let the data collection threads know we are ready + rrdhost_flag_set(state->host, RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED); + + rrdpush_sender_after_connect(state->host); + + return true; + } + + // we couldn't connect + + // increase the failed connections counter + state->not_connected_loops++; + + // slow re-connection on repeating errors + usec_t now_ut = now_monotonic_usec(); + usec_t end_ut = now_ut + USEC_PER_SEC * state->reconnect_delay; + while(now_ut < end_ut) { + if(nd_thread_signaled_to_cancel()) + return false; + + sleep_usec(100 * USEC_PER_MS); // seconds + now_ut = now_monotonic_usec(); + } + + return false; +} + +bool rrdpush_sender_connect(struct sender_state *s) { + worker_is_busy(WORKER_SENDER_JOB_CONNECT); + + time_t now_s = now_monotonic_sec(); + rrdpush_sender_cbuffer_recreate_timed(s, now_s, false, true); + rrdpush_sender_execute_commands_cleanup(s); + + rrdhost_flag_clear(s->host, RRDHOST_FLAG_RRDPUSH_SENDER_READY_4_METRICS); + s->flags &= ~SENDER_FLAG_OVERFLOW; + s->read_len = 0; + s->buffer->read = 0; + s->buffer->write = 0; + + if(!attempt_to_connect(s)) + return false; + + if(rrdhost_sender_should_exit(s)) + return false; + + s->last_traffic_seen_t = now_monotonic_sec(); + stream_path_send_to_parent(s->host); + rrdpush_sender_send_claimed_id(s->host); + rrdpush_send_host_labels(s->host); + rrdpush_send_global_functions(s->host); + s->replication.oldest_request_after_t = 0; + + rrdhost_flag_set(s->host, RRDHOST_FLAG_RRDPUSH_SENDER_READY_4_METRICS); + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "STREAM %s [send to %s]: enabling metrics streaming...", + rrdhost_hostname(s->host), s->connected_to); + + return true; +} + +// Either the receiver lost the connection or the host is being destroyed. +// The sender mutex guards thread creation, any spurious data is wiped on reconnection. +void rrdpush_sender_thread_stop(RRDHOST *host, STREAM_HANDSHAKE reason, bool wait) { + if (!host->sender) + return; + + sender_lock(host->sender); + + if(rrdhost_flag_check(host, RRDHOST_FLAG_RRDPUSH_SENDER_SPAWN)) { + + host->sender->exit.shutdown = true; + host->sender->exit.reason = reason; + + // signal it to cancel + nd_thread_signal_cancel(host->rrdpush_sender_thread); + } + + sender_unlock(host->sender); + + if(wait) { + sender_lock(host->sender); + while(host->sender->tid) { + sender_unlock(host->sender); + sleep_usec(10 * USEC_PER_MS); + sender_lock(host->sender); + } + sender_unlock(host->sender); + } +} |