diff options
Diffstat (limited to 'src/claim/claim.c')
-rw-r--r-- | src/claim/claim.c | 555 |
1 files changed, 147 insertions, 408 deletions
diff --git a/src/claim/claim.c b/src/claim/claim.c index 5383aac37..24e4e1c3c 100644 --- a/src/claim/claim.c +++ b/src/claim/claim.c @@ -1,470 +1,209 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "claim.h" -#include "registry/registry_internals.h" -#include "aclk/aclk.h" -#include "aclk/aclk_proxy.h" - -char *claiming_pending_arguments = NULL; - -static char *claiming_errors[] = { - "Agent claimed successfully", // 0 - "Unknown argument", // 1 - "Problems with claiming working directory", // 2 - "Missing dependencies", // 3 - "Failure to connect to endpoint", // 4 - "The CLI didn't work", // 5 - "Wrong user", // 6 - "Unknown HTTP error message", // 7 - "invalid node id", // 8 - "invalid node name", // 9 - "invalid room id", // 10 - "invalid public key", // 11 - "token expired/token not found/invalid token", // 12 - "already claimed", // 13 - "processing claiming", // 14 - "Internal Server Error", // 15 - "Gateway Timeout", // 16 - "Service Unavailable", // 17 - "Agent Unique Id Not Readable" // 18 -}; - -/* Retrieve the claim id for the agent. - * Caller owns the string. -*/ -char *get_agent_claimid() -{ - char *result; - rrdhost_aclk_state_lock(localhost); - result = (localhost->aclk_state.claimed_id == NULL) ? NULL : strdupz(localhost->aclk_state.claimed_id); - rrdhost_aclk_state_unlock(localhost); - return result; -} - -#define CLAIMING_COMMAND_LENGTH 16384 -#define CLAIMING_PROXY_LENGTH (CLAIMING_COMMAND_LENGTH/4) -/* rrd_init() and post_conf_load() must have been called before this function */ -CLAIM_AGENT_RESPONSE claim_agent(const char *claiming_arguments, bool force, const char **msg __maybe_unused) -{ - if (!force || !netdata_cloud_enabled) { - netdata_log_error("Refusing to claim agent -> cloud functionality has been disabled"); - return CLAIM_AGENT_CLOUD_DISABLED; - } +// -------------------------------------------------------------------------------------------------------------------- +// keep track of the last claiming failure reason -#ifndef DISABLE_CLOUD - char command_exec_buffer[CLAIMING_COMMAND_LENGTH + 1]; - char command_line_buffer[CLAIMING_COMMAND_LENGTH + 1]; +static char cloud_claim_failure_reason[4096] = ""; - // This is guaranteed to be set early in main via post_conf_load() - char *cloud_base_url = appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", NULL); - if (cloud_base_url == NULL) { - internal_fatal(true, "Do not move the cloud base url out of post_conf_load!!"); - return CLAIM_AGENT_NO_CLOUD_URL; +void claim_agent_failure_reason_set(const char *format, ...) { + if(!format || !*format) { + cloud_claim_failure_reason[0] = '\0'; + return; } - const char *proxy_str; - ACLK_PROXY_TYPE proxy_type; - char proxy_flag[CLAIMING_PROXY_LENGTH] = "-noproxy"; - - proxy_str = aclk_get_proxy(&proxy_type); - - if (proxy_type == PROXY_TYPE_SOCKS5 || proxy_type == PROXY_TYPE_HTTP) - snprintf(proxy_flag, CLAIMING_PROXY_LENGTH, "-proxy=\"%s\"", proxy_str); - - snprintfz(command_exec_buffer, CLAIMING_COMMAND_LENGTH, - "exec \"%s%snetdata-claim.sh\"", - netdata_exe_path[0] ? netdata_exe_path : "", - netdata_exe_path[0] ? "/" : "" - ); - - snprintfz(command_line_buffer, - CLAIMING_COMMAND_LENGTH, - "%s %s -hostname=%s -id=%s -url=%s -noreload %s", - command_exec_buffer, - proxy_flag, - netdata_configured_hostname, - localhost->machine_guid, - cloud_base_url, - claiming_arguments); - - netdata_log_info("Executing agent claiming command: %s", command_exec_buffer); - POPEN_INSTANCE *instance = spawn_popen_run(command_line_buffer); - if(!instance) { - netdata_log_error("Cannot popen(\"%s\").", command_exec_buffer); - return CLAIM_AGENT_CANNOT_EXECUTE_CLAIM_SCRIPT; - } + va_list args; + va_start(args, format); + vsnprintf(cloud_claim_failure_reason, sizeof(cloud_claim_failure_reason), format, args); + va_end(args); + + nd_log(NDLS_DAEMON, NDLP_ERR, + "CLAIM: %s", cloud_claim_failure_reason); +} - netdata_log_info("Waiting for claiming command '%s' to finish.", command_exec_buffer); - char read_buffer[100 + 1]; - while (fgets(read_buffer, 100, instance->child_stdout_fp) != NULL) ; +const char *claim_agent_failure_reason_get(void) { + if(!cloud_claim_failure_reason[0]) + return "Agent is not claimed yet"; + else + return cloud_claim_failure_reason; +} - int exit_code = spawn_popen_wait(instance); +// -------------------------------------------------------------------------------------------------------------------- +// claimed_id load/save - netdata_log_info("Agent claiming command '%s' returned with code %d", command_exec_buffer, exit_code); - if (0 == exit_code) { - load_claiming_state(); - return CLAIM_AGENT_OK; - } - if (exit_code < 0) { - netdata_log_error("Agent claiming command '%s' failed to complete its run", command_exec_buffer); - return CLAIM_AGENT_CLAIM_SCRIPT_FAILED; +bool claimed_id_save_to_file(const char *claimed_id_str) { + bool ret; + const char *filename = filename_from_path_entry_strdupz(netdata_configured_cloud_dir, "claimed_id"); + FILE *fp = fopen(filename, "w"); + if(fp) { + fprintf(fp, "%s", claimed_id_str); + fclose(fp); + ret = true; } - errno_clear(); - unsigned maximum_known_exit_code = sizeof(claiming_errors) / sizeof(claiming_errors[0]) - 1; - - if ((unsigned)exit_code > maximum_known_exit_code) { - netdata_log_error("Agent failed to be claimed with an unknown error. Cmd: '%s'", command_exec_buffer); - return CLAIM_AGENT_CLAIM_SCRIPT_RETURNED_INVALID_CODE; + else { + nd_log(NDLS_DAEMON, NDLP_ERR, + "CLAIM: cannot open file '%s' for writing.", filename); + ret = false; } - netdata_log_error("Agent failed to be claimed using the command '%s' with the following error message: %s", - command_exec_buffer, claiming_errors[exit_code]); + freez((void *)filename); + return ret; +} - if(msg) *msg = claiming_errors[exit_code]; +static ND_UUID claimed_id_parse(const char *claimed_id, const char *source) { + ND_UUID uuid; -#else - UNUSED(claiming_arguments); - UNUSED(claiming_errors); -#endif + if(uuid_parse_flexi(claimed_id, uuid.uuid) != 0) { + uuid = UUID_ZERO; + nd_log(NDLS_DAEMON, NDLP_ERR, + "CLAIM: claimed_id '%s' (loaded from '%s'), is not a valid UUID.", + claimed_id, source); + } - return CLAIM_AGENT_FAILED_WITH_MESSAGE; + return uuid; } -/* Change the claimed state of the agent. - * - * This only happens when the user has explicitly requested it: - * - via the cli tool by reloading the claiming state - * - after spawning the claim because of a command-line argument - * If this happens with the ACLK active under an old claim then we MUST KILL THE LINK - */ -void load_claiming_state(void) -{ - // -------------------------------------------------------------------- - // Check if the cloud is enabled -#if defined( DISABLE_CLOUD ) || !defined( ENABLE_ACLK ) - netdata_cloud_enabled = false; -#else - nd_uuid_t uuid; - - // Propagate into aclk and registry. Be kind of atomic... - appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", DEFAULT_CLOUD_BASE_URL); - - rrdhost_aclk_state_lock(localhost); - if (localhost->aclk_state.claimed_id) { - if (aclk_connected) - localhost->aclk_state.prev_claimed_id = strdupz(localhost->aclk_state.claimed_id); - freez(localhost->aclk_state.claimed_id); - localhost->aclk_state.claimed_id = NULL; - } - if (aclk_connected) - { - netdata_log_info("Agent was already connected to Cloud - forcing reconnection under new credentials"); - aclk_kill_link = 1; - } - aclk_disable_runtime = 0; - - char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s/cloud.d/claimed_id", netdata_configured_varlib_dir); +static ND_UUID claimed_id_load_from_file(void) { + ND_UUID uuid; long bytes_read; + const char *filename = filename_from_path_entry_strdupz(netdata_configured_cloud_dir, "claimed_id"); char *claimed_id = read_by_filename(filename, &bytes_read); - if(claimed_id && uuid_parse(claimed_id, uuid)) { - netdata_log_error("claimed_id \"%s\" doesn't look like valid UUID", claimed_id); - freez(claimed_id); - claimed_id = NULL; - } - - if(claimed_id) { - localhost->aclk_state.claimed_id = mallocz(UUID_STR_LEN); - uuid_unparse_lower(uuid, localhost->aclk_state.claimed_id); - } - rrdhost_aclk_state_unlock(localhost); - invalidate_node_instances(&localhost->host_uuid, claimed_id ? &uuid : NULL); - metaqueue_store_claim_id(&localhost->host_uuid, claimed_id ? &uuid : NULL); - - if (!claimed_id) { - netdata_log_info("Unable to load '%s', setting state to AGENT_UNCLAIMED", filename); - return; - } + if(!claimed_id) + uuid = UUID_ZERO; + else + uuid = claimed_id_parse(claimed_id, filename); freez(claimed_id); - - netdata_log_info("File '%s' was found. Setting state to AGENT_CLAIMED.", filename); - netdata_cloud_enabled = appconfig_get_boolean_ondemand(&cloud_config, CONFIG_SECTION_GLOBAL, "enabled", netdata_cloud_enabled); -#endif + freez((void *)filename); + return uuid; } -struct config cloud_config = { .first_section = NULL, - .last_section = NULL, - .mutex = NETDATA_MUTEX_INITIALIZER, - .index = { .avl_tree = { .root = NULL, .compar = appconfig_section_compare }, - .rwlock = AVL_LOCK_INITIALIZER } }; - -void load_cloud_conf(int silent) -{ - char *nd_disable_cloud = getenv("NETDATA_DISABLE_CLOUD"); - if (nd_disable_cloud && !strncmp(nd_disable_cloud, "1", 1)) - netdata_cloud_enabled = CONFIG_BOOLEAN_NO; - - char *filename; - errno_clear(); - - int ret = 0; - - filename = strdupz_path_subpath(netdata_configured_varlib_dir, "cloud.d/cloud.conf"); - - ret = appconfig_load(&cloud_config, filename, 1, NULL); - if(!ret && !silent) - netdata_log_info("CONFIG: cannot load cloud config '%s'. Running with internal defaults.", filename); - - freez(filename); - - // -------------------------------------------------------------------- - // Check if the cloud is enabled - -#if defined( DISABLE_CLOUD ) || !defined( ENABLE_ACLK ) - netdata_cloud_enabled = CONFIG_BOOLEAN_NO; -#else - netdata_cloud_enabled = appconfig_get_boolean_ondemand(&cloud_config, CONFIG_SECTION_GLOBAL, "enabled", netdata_cloud_enabled); -#endif - - // This must be set before any point in the code that accesses it. Do not move it from this function. - appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", DEFAULT_CLOUD_BASE_URL); -} - -static char *netdata_random_session_id_filename = NULL; -static nd_uuid_t netdata_random_session_id = { 0 }; - -bool netdata_random_session_id_generate(void) { - static char guid[UUID_STR_LEN] = ""; - - uuid_generate_random(netdata_random_session_id); - uuid_unparse_lower(netdata_random_session_id, guid); - - char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s/netdata_random_session_id", netdata_configured_varlib_dir); - - bool ret = true; - - (void)unlink(filename); - - // save it - int fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, 640); - if(fd == -1) { - netdata_log_error("Cannot create random session id file '%s'.", filename); - ret = false; +static ND_UUID claimed_id_get_from_cloud_conf(void) { + if(appconfig_exists(&cloud_config, CONFIG_SECTION_GLOBAL, "claimed_id")) { + const char *claimed_id = appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "claimed_id", ""); + if(claimed_id && *claimed_id) + return claimed_id_parse(claimed_id, "cloud.conf"); } - else { - if (write(fd, guid, UUID_STR_LEN - 1) != UUID_STR_LEN - 1) { - netdata_log_error("Cannot write the random session id file '%s'.", filename); - ret = false; - } else { - ssize_t bytes = write(fd, "\n", 1); - UNUSED(bytes); - } - close(fd); - } - - if(ret && (!netdata_random_session_id_filename || strcmp(netdata_random_session_id_filename, filename) != 0)) { - freez(netdata_random_session_id_filename); - netdata_random_session_id_filename = strdupz(filename); - } - - return ret; + return UUID_ZERO; } -const char *netdata_random_session_id_get_filename(void) { - if(!netdata_random_session_id_filename) - netdata_random_session_id_generate(); +static ND_UUID claimed_id_load(void) { + ND_UUID uuid = claimed_id_get_from_cloud_conf(); + if(UUIDiszero(uuid)) + uuid = claimed_id_load_from_file(); - return netdata_random_session_id_filename; + return uuid; } -bool netdata_random_session_id_matches(const char *guid) { - if(uuid_is_null(netdata_random_session_id)) - return false; +bool is_agent_claimed(void) { + ND_UUID uuid = claim_id_get_uuid(); + return !UUIDiszero(uuid); +} - nd_uuid_t uuid; +// -------------------------------------------------------------------------------------------------------------------- - if(uuid_parse(guid, uuid)) +bool claim_id_matches(const char *claim_id) { + ND_UUID this_one = UUID_ZERO; + if(uuid_parse_flexi(claim_id, this_one.uuid) != 0 || UUIDiszero(this_one)) return false; - if(uuid_compare(netdata_random_session_id, uuid) == 0) + ND_UUID having = claim_id_get_uuid(); + if(!UUIDiszero(having) && UUIDeq(having, this_one)) return true; return false; } -static bool check_claim_param(const char *s) { - if(!s || !*s) return true; +bool claim_id_matches_any(const char *claim_id) { + ND_UUID this_one = UUID_ZERO; + if(uuid_parse_flexi(claim_id, this_one.uuid) != 0 || UUIDiszero(this_one)) + return false; - do { - if(isalnum((uint8_t)*s) || *s == '.' || *s == ',' || *s == '-' || *s == ':' || *s == '/' || *s == '_') - ; - else - return false; + ND_UUID having = claim_id_get_uuid(); + if(!UUIDiszero(having) && UUIDeq(having, this_one)) + return true; - } while(*++s); + having = localhost->aclk.claim_id_of_parent; + if(!UUIDiszero(having) && UUIDeq(having, this_one)) + return true; - return true; -} + having = localhost->aclk.claim_id_of_origin; + if(!UUIDiszero(having) && UUIDeq(having, this_one)) + return true; -void claim_reload_all(void) { - nd_log_limits_unlimited(); - load_claiming_state(); - registry_update_cloud_base_url(); - rrdpush_send_claimed_id(localhost); - nd_log_limits_reset(); + return false; } -int api_v2_claim(struct web_client *w, char *url) { - char *key = NULL; - char *token = NULL; - char *rooms = NULL; - char *base_url = NULL; - - while (url) { - char *value = strsep_skip_consecutive_separators(&url, "&"); - if (!value || !*value) continue; - - char *name = strsep_skip_consecutive_separators(&value, "="); - if (!name || !*name) continue; - if (!value || !*value) continue; - - if(!strcmp(name, "key")) - key = value; - else if(!strcmp(name, "token")) - token = value; - else if(!strcmp(name, "rooms")) - rooms = value; - else if(!strcmp(name, "url")) - base_url = value; +/* Change the claimed state of the agent. + * + * This only happens when the user has explicitly requested it: + * - via the cli tool by reloading the claiming state + * - after spawning the claim because of a command-line argument + * If this happens with the ACLK active under an old claim then we MUST KILL THE LINK + */ +bool load_claiming_state(void) { + if (aclk_online()) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "CLAIM: agent was already connected to NC - forcing reconnection under new credentials"); + disconnect_req = ACLK_RELOAD_CONF; } + aclk_disable_runtime = 0; - BUFFER *wb = w->response.data; - buffer_flush(wb); - buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); - - time_t now_s = now_realtime_sec(); - CLOUD_STATUS status = buffer_json_cloud_status(wb, now_s); - - bool can_be_claimed = false; - switch(status) { - case CLOUD_STATUS_AVAILABLE: - case CLOUD_STATUS_DISABLED: - case CLOUD_STATUS_OFFLINE: - can_be_claimed = true; - break; - - case CLOUD_STATUS_UNAVAILABLE: - case CLOUD_STATUS_BANNED: - case CLOUD_STATUS_ONLINE: - can_be_claimed = false; - break; + ND_UUID uuid = claimed_id_load(); + if(UUIDiszero(uuid)) { + // not found + if(claim_agent_automatically()) + uuid = claimed_id_load(); } - buffer_json_member_add_boolean(wb, "can_be_claimed", can_be_claimed); - - if(can_be_claimed && key) { - if(!netdata_random_session_id_matches(key)) { - buffer_reset(wb); - buffer_strcat(wb, "invalid key"); - netdata_random_session_id_generate(); // generate a new key, to avoid an attack to find it - return HTTP_RESP_FORBIDDEN; - } - - if(!token || !base_url || !check_claim_param(token) || !check_claim_param(base_url) || (rooms && !check_claim_param(rooms))) { - buffer_reset(wb); - buffer_strcat(wb, "invalid parameters"); - netdata_random_session_id_generate(); // generate a new key, to avoid an attack to find it - return HTTP_RESP_BAD_REQUEST; - } - - netdata_random_session_id_generate(); // generate a new key, to avoid an attack to find it - - netdata_cloud_enabled = CONFIG_BOOLEAN_AUTO; - appconfig_set_boolean(&cloud_config, CONFIG_SECTION_GLOBAL, "enabled", CONFIG_BOOLEAN_AUTO); - appconfig_set(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", base_url); - - nd_uuid_t claimed_id; - uuid_generate_random(claimed_id); - char claimed_id_str[UUID_STR_LEN]; - uuid_unparse_lower(claimed_id, claimed_id_str); - - BUFFER *t = buffer_create(1024, NULL); - if(rooms) - buffer_sprintf(t, "-id=%s -token=%s -rooms=%s", claimed_id_str, token, rooms); - else - buffer_sprintf(t, "-id=%s -token=%s", claimed_id_str, token); - - bool success = false; - const char *msg = NULL; - CLAIM_AGENT_RESPONSE rc = claim_agent(buffer_tostring(t), true, &msg); - switch(rc) { - case CLAIM_AGENT_OK: - msg = "ok"; - success = true; - can_be_claimed = false; - claim_reload_all(); - { - int ms = 0; - do { - status = cloud_status(); - if (status == CLOUD_STATUS_ONLINE && __atomic_load_n(&localhost->node_id, __ATOMIC_RELAXED)) - break; - - sleep_usec(50 * USEC_PER_MS); - ms += 50; - } while (ms < 10000); - } - break; + bool have_claimed_id = false; + if(!UUIDiszero(uuid)) { + // we go it somehow + claim_id_set(uuid); + have_claimed_id = true; + } - case CLAIM_AGENT_NO_CLOUD_URL: - msg = "No Netdata Cloud URL."; - break; + invalidate_node_instances(&localhost->host_id.uuid, have_claimed_id ? &uuid.uuid : NULL); + metaqueue_store_claim_id(&localhost->host_id.uuid, have_claimed_id ? &uuid.uuid : NULL); - case CLAIM_AGENT_CLAIM_SCRIPT_FAILED: - msg = "Claiming script failed."; - break; + errno_clear(); - case CLAIM_AGENT_CLOUD_DISABLED: - msg = "Netdata Cloud is disabled on this agent."; - break; + if (!have_claimed_id) + nd_log(NDLS_DAEMON, NDLP_ERR, + "CLAIM: Unable to find our claimed_id, setting state to AGENT_UNCLAIMED"); + else + nd_log(NDLS_DAEMON, NDLP_INFO, + "CLAIM: Found a valid claimed_id, setting state to AGENT_CLAIMED"); - case CLAIM_AGENT_CANNOT_EXECUTE_CLAIM_SCRIPT: - msg = "Failed to execute claiming script."; - break; + return have_claimed_id; +} - case CLAIM_AGENT_CLAIM_SCRIPT_RETURNED_INVALID_CODE: - msg = "Claiming script returned invalid code."; - break; +CLOUD_STATUS claim_reload_and_wait_online(void) { + nd_log(NDLS_DAEMON, NDLP_INFO, + "CLAIM: Reloading Agent Claiming configuration."); - default: - case CLAIM_AGENT_FAILED_WITH_MESSAGE: - if(!msg) - msg = "Unknown error"; - break; - } - - // our status may have changed - // refresh the status in our output - buffer_flush(wb); - buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); - now_s = now_realtime_sec(); - buffer_json_cloud_status(wb, now_s); - - // and this is the status of the claiming command we run - buffer_json_member_add_boolean(wb, "success", success); - buffer_json_member_add_string(wb, "message", msg); - } + nd_log_limits_unlimited(); + cloud_conf_load(0); + bool claimed = load_claiming_state(); + registry_update_cloud_base_url(); + rrdpush_sender_send_claimed_id(localhost); + nd_log_limits_reset(); - if(can_be_claimed) - buffer_json_member_add_string(wb, "key_filename", netdata_random_session_id_get_filename()); + CLOUD_STATUS status = cloud_status(); + if(claimed) { + int ms = 0; + do { + status = cloud_status(); + if ((status == CLOUD_STATUS_ONLINE) && !UUIDiszero(localhost->node_id)) + break; - buffer_json_agents_v2(wb, NULL, now_s, false, false); - buffer_json_finalize(wb); + sleep_usec(50 * USEC_PER_MS); + ms += 50; + } while (ms < 10000); + } - return HTTP_RESP_OK; + return status; } |