diff options
Diffstat (limited to '')
-rw-r--r-- | src/streaming/protocol/command-nodeid.c | 128 |
1 files changed, 128 insertions, 0 deletions
diff --git a/src/streaming/protocol/command-nodeid.c b/src/streaming/protocol/command-nodeid.c new file mode 100644 index 00000000..85ace83c --- /dev/null +++ b/src/streaming/protocol/command-nodeid.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "commands.h" +#include "plugins.d/pluginsd_internals.h" + +// the child disconnected from the parent, and it has to clear the parent's claim id +void rrdpush_sender_clear_parent_claim_id(RRDHOST *host) { + host->aclk.claim_id_of_parent = UUID_ZERO; +} + +// the parent sends to the child its claim id, node id and cloud url +void rrdpush_receiver_send_node_and_claim_id_to_child(RRDHOST *host) { + if(host == localhost || UUIDiszero(host->node_id)) return; + + spinlock_lock(&host->receiver_lock); + if(host->receiver && stream_has_capability(host->receiver, STREAM_CAP_NODE_ID)) { + char node_id_str[UUID_STR_LEN] = ""; + uuid_unparse_lower(host->node_id.uuid, node_id_str); + + CLAIM_ID claim_id = claim_id_get(); + + if((!claim_id_is_set(claim_id) || !aclk_online())) { + // the agent is not claimed or not connected, just use parent claim id + // to allow the connection flow. + // this may be zero and it is ok. + claim_id.uuid = host->aclk.claim_id_of_parent; + uuid_unparse_lower(claim_id.uuid.uuid, claim_id.str); + } + + char buf[4096]; + snprintfz(buf, sizeof(buf), + PLUGINSD_KEYWORD_NODE_ID " '%s' '%s' '%s'\n", + claim_id.str, node_id_str, cloud_config_url_get()); + + send_to_plugin(buf, __atomic_load_n(&host->receiver->parser, __ATOMIC_RELAXED)); + } + spinlock_unlock(&host->receiver_lock); +} + +// the sender of the child receives node id, claim id and cloud url from the receiver of the parent +void rrdpush_sender_get_node_and_claim_id_from_parent(struct sender_state *s) { + char *claim_id_str = get_word(s->line.words, s->line.num_words, 1); + char *node_id_str = get_word(s->line.words, s->line.num_words, 2); + char *url = get_word(s->line.words, s->line.num_words, 3); + + bool claimed = is_agent_claimed(); + bool update_node_id = false; + + ND_UUID claim_id; + if (uuid_parse(claim_id_str ? claim_id_str : "", claim_id.uuid) != 0) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM %s [send to %s] received invalid claim id '%s'", + rrdhost_hostname(s->host), s->connected_to, + claim_id_str ? claim_id_str : "(unset)"); + return; + } + + ND_UUID node_id; + if(uuid_parse(node_id_str ? node_id_str : "", node_id.uuid) != 0) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM %s [send to %s] received an invalid node id '%s'", + rrdhost_hostname(s->host), s->connected_to, + node_id_str ? node_id_str : "(unset)"); + return; + } + + if (!UUIDiszero(s->host->aclk.claim_id_of_parent) && !UUIDeq(s->host->aclk.claim_id_of_parent, claim_id)) + nd_log(NDLS_DAEMON, NDLP_INFO, + "STREAM %s [send to %s] changed parent's claim id to %s", + rrdhost_hostname(s->host), s->connected_to, + claim_id_str ? claim_id_str : "(unset)"); + + if(!UUIDiszero(s->host->node_id) && !UUIDeq(s->host->node_id, node_id)) { + if(claimed) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM %s [send to %s] parent reports different node id '%s', but we are claimed. Ignoring it.", + rrdhost_hostname(s->host), s->connected_to, + node_id_str ? node_id_str : "(unset)"); + return; + } + else { + update_node_id = true; + nd_log(NDLS_DAEMON, NDLP_WARNING, + "STREAM %s [send to %s] changed node id to %s", + rrdhost_hostname(s->host), s->connected_to, + node_id_str ? node_id_str : "(unset)"); + } + } + + if(!url || !*url) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM %s [send to %s] received an invalid cloud URL '%s'", + rrdhost_hostname(s->host), s->connected_to, + url ? url : "(unset)"); + return; + } + + s->host->aclk.claim_id_of_parent = claim_id; + + // There are some very strange corner cases here: + // + // - Agent is claimed but offline, and it receives node_id and cloud_url from a different Netdata Cloud. + // - Agent is configured to talk to an on-prem Netdata Cloud, it is offline, but the parent is connected + // to a different Netdata Cloud. + // + // The solution below, tries to get the agent online, using the latest information. + // So, if the agent is not claimed or not connected, we inherit whatever information sent from the parent, + // to allow the user to work with it. + + if(claimed && aclk_online()) + // we are directly claimed and connected, ignore node id and cloud url + return; + + bool node_id_updated = false; + if(UUIDiszero(s->host->node_id) || update_node_id) { + s->host->node_id = node_id; + node_id_updated = true; + } + + // we change the URL, to allow the agent dashboard to work with Netdata Cloud on-prem, if any. + cloud_config_url_set(url); + + // send it down the line (to children) + rrdpush_receiver_send_node_and_claim_id_to_child(s->host); + + if(node_id_updated) + stream_path_node_id_updated(s->host); +} |