author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 06:53:20 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 06:53:20 +0000
commit     e5a812082ae033afb1eed82c0f2df3d0f6bdc93f (patch)
tree       a6716c9275b4b413f6c9194798b34b91affb3cc7 /daemons/execd
parent     Initial commit. (diff)
download   pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.tar.xz
           pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.zip
Adding upstream version 2.1.6. (tag: upstream/2.1.6)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'daemons/execd')
-rw-r--r--  daemons/execd/Makefile.am                  |   76
-rw-r--r--  daemons/execd/cts-exec-helper.c            |  624
-rw-r--r--  daemons/execd/execd_alerts.c               |  205
-rw-r--r--  daemons/execd/execd_commands.c             | 1927
-rw-r--r--  daemons/execd/pacemaker-execd.c            |  582
-rw-r--r--  daemons/execd/pacemaker-execd.h            |  110
-rw-r--r--  daemons/execd/pacemaker-remoted.8.inc      |    5
-rw-r--r--  daemons/execd/pacemaker_remote.in          |  176
-rw-r--r--  daemons/execd/pacemaker_remote.service.in  |   52
-rw-r--r--  daemons/execd/remoted_pidone.c             |  298
-rw-r--r--  daemons/execd/remoted_proxy.c              |  470
-rw-r--r--  daemons/execd/remoted_tls.c                |  428
12 files changed, 4953 insertions(+), 0 deletions(-)
diff --git a/daemons/execd/Makefile.am b/daemons/execd/Makefile.am
new file mode 100644
index 0000000..466f0df
--- /dev/null
+++ b/daemons/execd/Makefile.am
@@ -0,0 +1,76 @@
+#
+# Copyright 2012-2021 the Pacemaker project contributors
+#
+# The version control history for this file may have further details.
+#
+# This source code is licensed under the GNU Lesser General Public License
+# version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
+#
+
+include $(top_srcdir)/mk/common.mk
+include $(top_srcdir)/mk/man.mk
+
+halibdir = $(CRM_DAEMON_DIR)
+
+halib_PROGRAMS = pacemaker-execd cts-exec-helper
+
+EXTRA_DIST = pacemaker-remoted.8.inc
+
+pacemaker_execd_CFLAGS = $(CFLAGS_HARDENED_EXE)
+pacemaker_execd_LDFLAGS = $(LDFLAGS_HARDENED_EXE)
+
+pacemaker_execd_LDADD = $(top_builddir)/lib/common/libcrmcommon.la \
+		$(top_builddir)/lib/services/libcrmservice.la \
+		$(top_builddir)/lib/fencing/libstonithd.la
+pacemaker_execd_SOURCES = pacemaker-execd.c execd_commands.c \
+		execd_alerts.c
+
+if BUILD_REMOTE
+sbin_PROGRAMS = pacemaker-remoted
+if BUILD_SYSTEMD
+systemdsystemunit_DATA = pacemaker_remote.service
+else
+initdir = $(INITDIR)
+init_SCRIPTS = pacemaker_remote
+endif
+
+pacemaker_remoted_CPPFLAGS = -DPCMK__COMPILE_REMOTE $(AM_CPPFLAGS)
+
+pacemaker_remoted_CFLAGS = $(CFLAGS_HARDENED_EXE)
+pacemaker_remoted_LDFLAGS = $(LDFLAGS_HARDENED_EXE)
+
+pacemaker_remoted_LDADD = $(pacemaker_execd_LDADD) \
+		$(top_builddir)/lib/lrmd/liblrmd.la
+pacemaker_remoted_SOURCES = $(pacemaker_execd_SOURCES) \
+		remoted_tls.c remoted_pidone.c remoted_proxy.c
+endif
+
+cts_exec_helper_LDADD = $(top_builddir)/lib/common/libcrmcommon.la \
+		$(top_builddir)/lib/lrmd/liblrmd.la \
+		$(top_builddir)/lib/cib/libcib.la \
+		$(top_builddir)/lib/services/libcrmservice.la \
+		$(top_builddir)/lib/pengine/libpe_status.la
+cts_exec_helper_SOURCES = cts-exec-helper.c
+
+noinst_HEADERS = pacemaker-execd.h
+
+CLEANFILES = $(man8_MANS)
+
+# Always create a symlink for the old pacemaker_remoted name, so that bundle
+# container images using a current Pacemaker will run on cluster nodes running
+# Pacemaker 1 (>=1.1.17).
+install-exec-hook:
+if BUILD_LEGACY_LINKS
+	cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f lrmd && $(LN_S) pacemaker-execd lrmd
+endif
+if BUILD_REMOTE
+	cd $(DESTDIR)$(sbindir) && rm -f pacemaker_remoted && $(LN_S) pacemaker-remoted pacemaker_remoted
+endif
+
+uninstall-hook:
+if BUILD_LEGACY_LINKS
+	cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f lrmd
+endif
+if BUILD_REMOTE
+	cd $(DESTDIR)$(sbindir) && rm -f pacemaker_remoted
+endif
diff --git a/daemons/execd/cts-exec-helper.c b/daemons/execd/cts-exec-helper.c
new file mode 100644
index 0000000..2af5e16
--- /dev/null
+++ b/daemons/execd/cts-exec-helper.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright 2012-2023 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ *
+ * This source code is licensed under the GNU Lesser General Public License
+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
+ */ + +#include <crm_internal.h> + +#include <glib.h> +#include <unistd.h> + +#include <crm/crm.h> +#include <crm/services.h> +#include <crm/common/cmdline_internal.h> +#include <crm/common/mainloop.h> + +#include <crm/pengine/status.h> +#include <crm/pengine/internal.h> +#include <crm/cib.h> +#include <crm/cib/internal.h> +#include <crm/lrmd.h> + +#define SUMMARY "cts-exec-helper - inject commands into the Pacemaker executor and watch for events" + +static int exec_call_id = 0; +static gboolean start_test(gpointer user_data); +static void try_connect(void); + +static char *key = NULL; +static char *val = NULL; + +static struct { + int verbose; + int quiet; + guint interval_ms; + int timeout; + int start_delay; + int cancel_call_id; + gboolean no_wait; + gboolean is_running; + gboolean no_connect; + int exec_call_opts; + const char *api_call; + const char *rsc_id; + const char *provider; + const char *class; + const char *type; + const char *action; + const char *listen; + gboolean use_tls; + lrmd_key_value_t *params; +} options; + +static gboolean +interval_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **error) { + options.interval_ms = crm_parse_interval_spec(optarg); + return errno == 0; +} + +static gboolean +notify_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **error) { + if (pcmk__str_any_of(option_name, "--notify-orig", "-n", NULL)) { + options.exec_call_opts = lrmd_opt_notify_orig_only; + } else if (pcmk__str_any_of(option_name, "--notify-changes", "-o", NULL)) { + options.exec_call_opts = lrmd_opt_notify_changes_only; + } + + return TRUE; +} + +static gboolean +param_key_val_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **error) { + if (pcmk__str_any_of(option_name, "--param-key", "-k", NULL)) { + pcmk__str_update(&key, optarg); + } else if (pcmk__str_any_of(option_name, "--param-val", "-v", NULL)) { + pcmk__str_update(&val, optarg); + } + + if (key != NULL && val != NULL) { + options.params = lrmd_key_value_add(options.params, key, val); + pcmk__str_update(&key, NULL); + pcmk__str_update(&val, NULL); + } + + return TRUE; +} + +static GOptionEntry basic_entries[] = { + { "api-call", 'c', 0, G_OPTION_ARG_STRING, &options.api_call, + "Directly relates to executor API functions", + NULL }, + + { "is-running", 'R', 0, G_OPTION_ARG_NONE, &options.is_running, + "Determine if a resource is registered and running", + NULL }, + + { "listen", 'l', 0, G_OPTION_ARG_STRING, &options.listen, + "Listen for a specific event string", + NULL }, + + { "no-wait", 'w', 0, G_OPTION_ARG_NONE, &options.no_wait, + "Make api call and do not wait for result", + NULL }, + + { "notify-changes", 'o', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, notify_cb, + "Only notify client changes to recurring operations", + NULL }, + + { "notify-orig", 'n', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, notify_cb, + "Only notify this client of the results of an API action", + NULL }, + + { "tls", 'S', 0, G_OPTION_ARG_NONE, &options.use_tls, + "Use TLS backend for local connection", + NULL }, + + { NULL } +}; + +static GOptionEntry api_call_entries[] = { + { "action", 'a', 0, G_OPTION_ARG_STRING, &options.action, + NULL, NULL }, + + { "cancel-call-id", 'x', 0, G_OPTION_ARG_INT, &options.cancel_call_id, + NULL, NULL }, + + { "class", 'C', 0, G_OPTION_ARG_STRING, &options.class, + NULL, NULL }, + + { "interval", 'i', 0, G_OPTION_ARG_CALLBACK, interval_cb, + NULL, NULL }, + + { "param-key", 'k', 0, G_OPTION_ARG_CALLBACK, param_key_val_cb, + NULL, 
NULL }, + + { "param-val", 'v', 0, G_OPTION_ARG_CALLBACK, param_key_val_cb, + NULL, NULL }, + + { "provider", 'P', 0, G_OPTION_ARG_STRING, &options.provider, + NULL, NULL }, + + { "rsc-id", 'r', 0, G_OPTION_ARG_STRING, &options.rsc_id, + NULL, NULL }, + + { "start-delay", 's', 0, G_OPTION_ARG_INT, &options.start_delay, + NULL, NULL }, + + { "timeout", 't', 0, G_OPTION_ARG_INT, &options.timeout, + NULL, NULL }, + + { "type", 'T', 0, G_OPTION_ARG_STRING, &options.type, + NULL, NULL }, + + { NULL } +}; + +static GMainLoop *mainloop = NULL; +static lrmd_t *lrmd_conn = NULL; + +static char event_buf_v0[1024]; + +static crm_exit_t +test_exit(crm_exit_t exit_code) +{ + lrmd_api_delete(lrmd_conn); + return crm_exit(exit_code); +} + +#define print_result(fmt, args...) \ + if (!options.quiet) { \ + printf(fmt "\n" , ##args); \ + } + +#define report_event(event) \ + snprintf(event_buf_v0, sizeof(event_buf_v0), "NEW_EVENT event_type:%s rsc_id:%s action:%s rc:%s op_status:%s", \ + lrmd_event_type2str(event->type), \ + event->rsc_id, \ + event->op_type ? event->op_type : "none", \ + services_ocf_exitcode_str(event->rc), \ + pcmk_exec_status_str(event->op_status)); \ + crm_info("%s", event_buf_v0); + +static void +test_shutdown(int nsig) +{ + lrmd_api_delete(lrmd_conn); + lrmd_conn = NULL; +} + +static void +read_events(lrmd_event_data_t * event) +{ + report_event(event); + if (options.listen) { + if (pcmk__str_eq(options.listen, event_buf_v0, pcmk__str_casei)) { + print_result("LISTEN EVENT SUCCESSFUL"); + test_exit(CRM_EX_OK); + } + } + + if (exec_call_id && (event->call_id == exec_call_id)) { + if (event->op_status == 0 && event->rc == 0) { + print_result("API-CALL SUCCESSFUL for 'exec'"); + } else { + print_result("API-CALL FAILURE for 'exec', rc:%d lrmd_op_status:%s", + event->rc, pcmk_exec_status_str(event->op_status)); + test_exit(CRM_EX_ERROR); + } + + if (!options.listen) { + test_exit(CRM_EX_OK); + } + } +} + +static gboolean +timeout_err(gpointer data) +{ + print_result("LISTEN EVENT FAILURE - timeout occurred, never found"); + test_exit(CRM_EX_TIMEOUT); + return FALSE; +} + +static void +connection_events(lrmd_event_data_t * event) +{ + int rc = event->connection_rc; + + if (event->type != lrmd_event_connect) { + /* ignore */ + return; + } + + if (!rc) { + crm_info("Executor client connection established"); + start_test(NULL); + return; + } else { + sleep(1); + try_connect(); + crm_notice("Executor client connection failed"); + } +} + +static void +try_connect(void) +{ + int tries = 10; + static int num_tries = 0; + int rc = 0; + + lrmd_conn->cmds->set_callback(lrmd_conn, connection_events); + for (; num_tries < tries; num_tries++) { + rc = lrmd_conn->cmds->connect_async(lrmd_conn, crm_system_name, 3000); + + if (!rc) { + return; /* we'll hear back in async callback */ + } + sleep(1); + } + + print_result("API CONNECTION FAILURE"); + test_exit(CRM_EX_ERROR); +} + +static gboolean +start_test(gpointer user_data) +{ + int rc = 0; + + if (!options.no_connect) { + if (!lrmd_conn->cmds->is_connected(lrmd_conn)) { + try_connect(); + /* async connect -- this function will get called back into */ + return 0; + } + } + lrmd_conn->cmds->set_callback(lrmd_conn, read_events); + + if (options.timeout) { + g_timeout_add(options.timeout, timeout_err, NULL); + } + + if (!options.api_call) { + return 0; + } + + if (pcmk__str_eq(options.api_call, "exec", pcmk__str_casei)) { + rc = lrmd_conn->cmds->exec(lrmd_conn, + options.rsc_id, + options.action, + NULL, + options.interval_ms, + options.timeout, + 
options.start_delay, + options.exec_call_opts, + options.params); + + if (rc > 0) { + exec_call_id = rc; + print_result("API-CALL 'exec' action pending, waiting on response"); + } + + } else if (pcmk__str_eq(options.api_call, "register_rsc", pcmk__str_casei)) { + rc = lrmd_conn->cmds->register_rsc(lrmd_conn, + options.rsc_id, + options.class, options.provider, options.type, 0); + } else if (pcmk__str_eq(options.api_call, "get_rsc_info", pcmk__str_casei)) { + lrmd_rsc_info_t *rsc_info; + + rsc_info = lrmd_conn->cmds->get_rsc_info(lrmd_conn, options.rsc_id, 0); + + if (rsc_info) { + print_result("RSC_INFO: id:%s class:%s provider:%s type:%s", + rsc_info->id, rsc_info->standard, + (rsc_info->provider? rsc_info->provider : "<none>"), + rsc_info->type); + lrmd_free_rsc_info(rsc_info); + rc = pcmk_ok; + } else { + rc = -1; + } + } else if (pcmk__str_eq(options.api_call, "unregister_rsc", pcmk__str_casei)) { + rc = lrmd_conn->cmds->unregister_rsc(lrmd_conn, options.rsc_id, 0); + } else if (pcmk__str_eq(options.api_call, "cancel", pcmk__str_casei)) { + rc = lrmd_conn->cmds->cancel(lrmd_conn, options.rsc_id, options.action, + options.interval_ms); + } else if (pcmk__str_eq(options.api_call, "metadata", pcmk__str_casei)) { + char *output = NULL; + + rc = lrmd_conn->cmds->get_metadata(lrmd_conn, + options.class, + options.provider, options.type, &output, 0); + if (rc == pcmk_ok) { + print_result("%s", output); + free(output); + } + } else if (pcmk__str_eq(options.api_call, "list_agents", pcmk__str_casei)) { + lrmd_list_t *list = NULL; + lrmd_list_t *iter = NULL; + + rc = lrmd_conn->cmds->list_agents(lrmd_conn, &list, options.class, options.provider); + + if (rc > 0) { + print_result("%d agents found", rc); + for (iter = list; iter != NULL; iter = iter->next) { + print_result("%s", iter->val); + } + lrmd_list_freeall(list); + rc = 0; + } else { + print_result("API_CALL FAILURE - no agents found"); + rc = -1; + } + } else if (pcmk__str_eq(options.api_call, "list_ocf_providers", pcmk__str_casei)) { + lrmd_list_t *list = NULL; + lrmd_list_t *iter = NULL; + + rc = lrmd_conn->cmds->list_ocf_providers(lrmd_conn, options.type, &list); + + if (rc > 0) { + print_result("%d providers found", rc); + for (iter = list; iter != NULL; iter = iter->next) { + print_result("%s", iter->val); + } + lrmd_list_freeall(list); + rc = 0; + } else { + print_result("API_CALL FAILURE - no providers found"); + rc = -1; + } + + } else if (pcmk__str_eq(options.api_call, "list_standards", pcmk__str_casei)) { + lrmd_list_t *list = NULL; + lrmd_list_t *iter = NULL; + + rc = lrmd_conn->cmds->list_standards(lrmd_conn, &list); + + if (rc > 0) { + print_result("%d standards found", rc); + for (iter = list; iter != NULL; iter = iter->next) { + print_result("%s", iter->val); + } + lrmd_list_freeall(list); + rc = 0; + } else { + print_result("API_CALL FAILURE - no providers found"); + rc = -1; + } + + } else if (pcmk__str_eq(options.api_call, "get_recurring_ops", pcmk__str_casei)) { + GList *op_list = NULL; + GList *op_item = NULL; + rc = lrmd_conn->cmds->get_recurring_ops(lrmd_conn, options.rsc_id, 0, 0, + &op_list); + + for (op_item = op_list; op_item != NULL; op_item = op_item->next) { + lrmd_op_info_t *op_info = op_item->data; + + print_result("RECURRING_OP: %s_%s_%s timeout=%sms", + op_info->rsc_id, op_info->action, + op_info->interval_ms_s, op_info->timeout_ms_s); + lrmd_free_op_info(op_info); + } + g_list_free(op_list); + + } else if (options.api_call) { + print_result("API-CALL FAILURE unknown action '%s'", options.action); + 
test_exit(CRM_EX_ERROR); + } + + if (rc < 0) { + print_result("API-CALL FAILURE for '%s' api_rc:%d", + options.api_call, rc); + test_exit(CRM_EX_ERROR); + } + + if (options.api_call && rc == pcmk_ok) { + print_result("API-CALL SUCCESSFUL for '%s'", options.api_call); + if (!options.listen) { + test_exit(CRM_EX_OK); + } + } + + if (options.no_wait) { + /* just make the call and exit regardless of anything else. */ + test_exit(CRM_EX_OK); + } + + return 0; +} + +/*! + * \internal + * \brief Generate resource parameters from CIB if none explicitly given + * + * \return Standard Pacemaker return code + */ +static int +generate_params(void) +{ + int rc = pcmk_rc_ok; + pe_working_set_t *data_set = NULL; + xmlNode *cib_xml_copy = NULL; + pe_resource_t *rsc = NULL; + GHashTable *params = NULL; + GHashTable *meta = NULL; + GHashTableIter iter; + char *key = NULL; + char *value = NULL; + + if (options.params != NULL) { + return pcmk_rc_ok; // User specified parameters explicitly + } + + // Retrieve and update CIB + rc = cib__signon_query(NULL, NULL, &cib_xml_copy); + if (rc != pcmk_rc_ok) { + return rc; + } + if (!cli_config_update(&cib_xml_copy, NULL, FALSE)) { + crm_err("Could not update CIB"); + return pcmk_rc_cib_corrupt; + } + + // Calculate cluster status + data_set = pe_new_working_set(); + if (data_set == NULL) { + crm_crit("Could not allocate working set"); + return ENOMEM; + } + pe__set_working_set_flags(data_set, pe_flag_no_counts|pe_flag_no_compat); + data_set->input = cib_xml_copy; + data_set->now = crm_time_new(NULL); + cluster_status(data_set); + + // Find resource in CIB + rsc = pe_find_resource_with_flags(data_set->resources, options.rsc_id, + pe_find_renamed|pe_find_any); + if (rsc == NULL) { + crm_err("Resource does not exist in config"); + pe_free_working_set(data_set); + return EINVAL; + } + + // Add resource instance parameters to options.params + params = pe_rsc_params(rsc, NULL, data_set); + if (params != NULL) { + g_hash_table_iter_init(&iter, params); + while (g_hash_table_iter_next(&iter, (gpointer *) &key, + (gpointer *) &value)) { + options.params = lrmd_key_value_add(options.params, key, value); + } + } + + // Add resource meta-attributes to options.params + meta = pcmk__strkey_table(free, free); + get_meta_attributes(meta, rsc, NULL, data_set); + g_hash_table_iter_init(&iter, meta); + while (g_hash_table_iter_next(&iter, (gpointer *) &key, + (gpointer *) &value)) { + char *crm_name = crm_meta_name(key); + + options.params = lrmd_key_value_add(options.params, crm_name, value); + free(crm_name); + } + g_hash_table_destroy(meta); + + pe_free_working_set(data_set); + return rc; +} + +static GOptionContext * +build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { + GOptionContext *context = NULL; + + context = pcmk__build_arg_context(args, NULL, group, NULL); + + pcmk__add_main_args(context, basic_entries); + pcmk__add_arg_group(context, "api-call", "API Call Options:", + "Parameters for api-call option", api_call_entries); + + return context; +} + +int +main(int argc, char **argv) +{ + GError *error = NULL; + crm_exit_t exit_code = CRM_EX_OK; + crm_trigger_t *trig = NULL; + + pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); + /* Typically we'd pass all the single character options that take an argument + * as the second parameter here (and there's a bunch of those in this tool). + * However, we control how this program is called so we can just not call it + * in a way where the preprocessing ever matters. 
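+ *
+ * (For example, an attached value such as "-t5000" would only be split into
+ * option and argument if 't' were listed in that second parameter; the CTS
+ * harness always passes option and value as separate arguments, so NULL is
+ * safe here.)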
+ */ + gchar **processed_args = pcmk__cmdline_preproc(argv, NULL); + GOptionContext *context = build_arg_context(args, NULL); + + if (!g_option_context_parse_strv(context, &processed_args, &error)) { + exit_code = CRM_EX_USAGE; + goto done; + } + + /* We have to use crm_log_init here to set up the logging because there's + * different handling for daemons vs. command line programs, and + * pcmk__cli_init_logging is set up to only handle the latter. + */ + crm_log_init(NULL, LOG_INFO, TRUE, (args->verbosity? TRUE : FALSE), argc, + argv, FALSE); + + for (int i = 0; i < args->verbosity; i++) { + crm_bump_log_level(argc, argv); + } + + if (!options.listen && pcmk__strcase_any_of(options.api_call, "metadata", "list_agents", + "list_standards", "list_ocf_providers", NULL)) { + options.no_connect = TRUE; + } + + if (options.is_running) { + int rc = pcmk_rc_ok; + + if (options.rsc_id == NULL) { + exit_code = CRM_EX_USAGE; + g_set_error(&error, PCMK__EXITC_ERROR, exit_code, + "--is-running requires --rsc-id"); + goto done; + } + + options.interval_ms = 0; + if (options.timeout == 0) { + options.timeout = 30000; + } + + rc = generate_params(); + if (rc != pcmk_rc_ok) { + exit_code = pcmk_rc2exitc(rc); + g_set_error(&error, PCMK__EXITC_ERROR, exit_code, + "Can not determine resource status: " + "unable to get parameters from CIB"); + goto done; + } + options.api_call = "exec"; + options.action = "monitor"; + options.exec_call_opts = lrmd_opt_notify_orig_only; + } + + if (!options.api_call && !options.listen) { + exit_code = CRM_EX_USAGE; + g_set_error(&error, PCMK__EXITC_ERROR, exit_code, + "Must specify at least one of --api-call, --listen, " + "or --is-running"); + goto done; + } + + if (options.use_tls) { + lrmd_conn = lrmd_remote_api_new(NULL, "localhost", 0); + } else { + lrmd_conn = lrmd_api_new(); + } + trig = mainloop_add_trigger(G_PRIORITY_HIGH, start_test, NULL); + mainloop_set_trigger(trig); + mainloop_add_signal(SIGTERM, test_shutdown); + + crm_info("Starting"); + mainloop = g_main_loop_new(NULL, FALSE); + g_main_loop_run(mainloop); + +done: + g_strfreev(processed_args); + pcmk__free_arg_context(context); + + free(key); + free(val); + + pcmk__output_and_clear_error(&error, NULL); + return test_exit(exit_code); +} diff --git a/daemons/execd/execd_alerts.c b/daemons/execd/execd_alerts.c new file mode 100644 index 0000000..5944d93 --- /dev/null +++ b/daemons/execd/execd_alerts.c @@ -0,0 +1,205 @@ +/* + * Copyright 2016-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. 
+ */ + +#include <crm_internal.h> + +#include <glib.h> + +#include <crm/crm.h> +#include <crm/services.h> +#include <crm/services_internal.h> +#include <crm/common/ipc.h> +#include <crm/common/ipc_internal.h> +#include <crm/common/alerts_internal.h> +#include <crm/msg_xml.h> + +#include "pacemaker-execd.h" + +/* Track in-flight alerts so we can wait for them at shutdown */ +static GHashTable *inflight_alerts; /* key = call_id, value = timeout */ +static gboolean draining_alerts = FALSE; + +static inline void +add_inflight_alert(int call_id, int timeout) +{ + if (inflight_alerts == NULL) { + inflight_alerts = pcmk__intkey_table(NULL); + } + pcmk__intkey_table_insert(inflight_alerts, call_id, + GINT_TO_POINTER(timeout)); +} + +static inline void +remove_inflight_alert(int call_id) +{ + if (inflight_alerts != NULL) { + pcmk__intkey_table_remove(inflight_alerts, call_id); + } +} + +static int +max_inflight_timeout(void) +{ + GHashTableIter iter; + gpointer timeout; + int max_timeout = 0; + + if (inflight_alerts) { + g_hash_table_iter_init(&iter, inflight_alerts); + while (g_hash_table_iter_next(&iter, NULL, &timeout)) { + if (GPOINTER_TO_INT(timeout) > max_timeout) { + max_timeout = GPOINTER_TO_INT(timeout); + } + } + } + return max_timeout; +} + +struct alert_cb_s { + char *client_id; + int call_id; +}; + +static void +alert_complete(svc_action_t *action) +{ + struct alert_cb_s *cb_data = (struct alert_cb_s *) (action->cb_data); + + CRM_CHECK(cb_data != NULL, return); + + remove_inflight_alert(cb_data->call_id); + + if (action->status != PCMK_EXEC_DONE) { + const char *reason = services__exit_reason(action); + + crm_notice("Could not send alert: %s%s%s%s " CRM_XS " client=%s", + pcmk_exec_status_str(action->status), + (reason == NULL)? "" : " (", + (reason == NULL)? "" : reason, + (reason == NULL)? 
"" : ")", + cb_data->client_id); + + } else if (action->rc != 0) { + crm_notice("Alert [%d] completed but exited with status %d " + CRM_XS " client=%s", + action->pid, action->rc, cb_data->client_id); + + } else { + crm_debug("Alert [%d] completed " CRM_XS " client=%s", + action->pid, cb_data->client_id); + } + + free(cb_data->client_id); + free(action->cb_data); + action->cb_data = NULL; +} + +int +process_lrmd_alert_exec(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + static int alert_sequence_no = 0; + + xmlNode *alert_xml = get_xpath_object("//" F_LRMD_ALERT, request, LOG_ERR); + const char *alert_id = crm_element_value(alert_xml, F_LRMD_ALERT_ID); + const char *alert_path = crm_element_value(alert_xml, F_LRMD_ALERT_PATH); + svc_action_t *action = NULL; + int alert_timeout = 0; + int rc = pcmk_ok; + GHashTable *params = NULL; + struct alert_cb_s *cb_data = NULL; + + if ((alert_id == NULL) || (alert_path == NULL) || + (client == NULL) || (client->id == NULL)) { /* hint static analyzer */ + return -EINVAL; + } + if (draining_alerts) { + return pcmk_ok; + } + + crm_element_value_int(alert_xml, F_LRMD_TIMEOUT, &alert_timeout); + + crm_info("Executing alert %s for %s", alert_id, client->id); + + params = xml2list(alert_xml); + pcmk__add_alert_key_int(params, PCMK__alert_key_node_sequence, + ++alert_sequence_no); + + cb_data = calloc(1, sizeof(struct alert_cb_s)); + if (cb_data == NULL) { + rc = -errno; + goto err; + } + + /* coverity[deref_ptr] False Positive */ + cb_data->client_id = strdup(client->id); + if (cb_data->client_id == NULL) { + rc = -errno; + goto err; + } + + crm_element_value_int(request, F_LRMD_CALLID, &(cb_data->call_id)); + + action = services_alert_create(alert_id, alert_path, alert_timeout, params, + alert_sequence_no, cb_data); + if (action->rc != PCMK_OCF_UNKNOWN) { + rc = -E2BIG; + goto err; + } + + rc = services_action_user(action, CRM_DAEMON_USER); + if (rc < 0) { + goto err; + } + + add_inflight_alert(cb_data->call_id, alert_timeout); + if (services_alert_async(action, alert_complete) == FALSE) { + services_action_free(action); + } + return pcmk_ok; + +err: + if (cb_data) { + if (cb_data->client_id) { + free(cb_data->client_id); + } + free(cb_data); + } + services_action_free(action); + return rc; +} + +static bool +drain_check(guint remaining_timeout_ms) +{ + if (inflight_alerts != NULL) { + guint count = g_hash_table_size(inflight_alerts); + + if (count > 0) { + crm_trace("%d alerts pending (%.3fs timeout remaining)", + count, remaining_timeout_ms / 1000.0); + return TRUE; + } + } + return FALSE; +} + +void +lrmd_drain_alerts(GMainLoop *mloop) +{ + if (inflight_alerts != NULL) { + guint timer_ms = max_inflight_timeout() + 5000; + + crm_trace("Draining in-flight alerts (timeout %.3fs)", + timer_ms / 1000.0); + draining_alerts = TRUE; + pcmk_drain_main_loop(mloop, timer_ms, drain_check); + g_hash_table_destroy(inflight_alerts); + inflight_alerts = NULL; + } +} diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c new file mode 100644 index 0000000..fa2761e --- /dev/null +++ b/daemons/execd/execd_commands.c @@ -0,0 +1,1927 @@ +/* + * Copyright 2012-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */ + +#include <crm_internal.h> +#include <crm/fencing/internal.h> + +#include <glib.h> + +// Check whether we have a high-resolution monotonic clock +#undef PCMK__TIME_USE_CGT +#if HAVE_DECL_CLOCK_MONOTONIC && defined(CLOCK_MONOTONIC) +# define PCMK__TIME_USE_CGT +# include <time.h> /* clock_gettime */ +#endif + +#include <unistd.h> + +#include <crm/crm.h> +#include <crm/fencing/internal.h> +#include <crm/services.h> +#include <crm/services_internal.h> +#include <crm/common/mainloop.h> +#include <crm/common/ipc.h> +#include <crm/common/ipc_internal.h> +#include <crm/msg_xml.h> + +#include "pacemaker-execd.h" + +GHashTable *rsc_list = NULL; + +typedef struct lrmd_cmd_s { + int timeout; + guint interval_ms; + int start_delay; + int timeout_orig; + + int call_id; + + int call_opts; + /* Timer ids, must be removed on cmd destruction. */ + int delay_id; + int stonith_recurring_id; + + int rsc_deleted; + + int service_flags; + + char *client_id; + char *origin; + char *rsc_id; + char *action; + char *real_action; + char *userdata_str; + + pcmk__action_result_t result; + + /* We can track operation queue time and run time, to be saved with the CIB + * resource history (and displayed in cluster status). We need + * high-resolution monotonic time for this purpose, so we use + * clock_gettime(CLOCK_MONOTONIC, ...) (if available, otherwise this feature + * is disabled). + * + * However, we also need epoch timestamps for recording the time the command + * last ran and the time its return value last changed, for use in time + * displays (as opposed to interval calculations). We keep time_t values for + * this purpose. + * + * The last run time is used for both purposes, so we keep redundant + * monotonic and epoch values for this. Technically the two could represent + * different times, but since time_t has only second resolution and the + * values are used for distinct purposes, that is not significant. + */ +#ifdef PCMK__TIME_USE_CGT + /* Recurring and systemd operations may involve more than one executor + * command per operation, so they need info about the original and the most + * recent. + */ + struct timespec t_first_run; // When op first ran + struct timespec t_run; // When op most recently ran + struct timespec t_first_queue; // When op was first queued + struct timespec t_queue; // When op was most recently queued +#endif + time_t epoch_last_run; // Epoch timestamp of when op last ran + time_t epoch_rcchange; // Epoch timestamp of when rc last changed + + bool first_notify_sent; + int last_notify_rc; + int last_notify_op_status; + int last_pid; + + GHashTable *params; +} lrmd_cmd_t; + +static void cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc); +static gboolean execute_resource_action(gpointer user_data); +static void cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id); + +#ifdef PCMK__TIME_USE_CGT + +/*! + * \internal + * \brief Check whether a struct timespec has been set + * + * \param[in] timespec Time to check + * + * \return true if timespec has been set (i.e. 
is nonzero), false otherwise + */ +static inline bool +time_is_set(const struct timespec *timespec) +{ + return (timespec != NULL) && + ((timespec->tv_sec != 0) || (timespec->tv_nsec != 0)); +} + +/* + * \internal + * \brief Set a timespec (and its original if unset) to the current time + * + * \param[out] t_current Where to store current time + * \param[out] t_orig Where to copy t_current if unset + */ +static void +get_current_time(struct timespec *t_current, struct timespec *t_orig) +{ + clock_gettime(CLOCK_MONOTONIC, t_current); + if ((t_orig != NULL) && !time_is_set(t_orig)) { + *t_orig = *t_current; + } +} + +/*! + * \internal + * \brief Return difference between two times in milliseconds + * + * \param[in] now More recent time (or NULL to use current time) + * \param[in] old Earlier time + * + * \return milliseconds difference (or 0 if old is NULL or unset) + * + * \note Can overflow on 32bit machines when the differences is around + * 24 days or more. + */ +static int +time_diff_ms(const struct timespec *now, const struct timespec *old) +{ + int diff_ms = 0; + + if (time_is_set(old)) { + struct timespec local_now = { 0, }; + + if (now == NULL) { + clock_gettime(CLOCK_MONOTONIC, &local_now); + now = &local_now; + } + diff_ms = (now->tv_sec - old->tv_sec) * 1000 + + (now->tv_nsec - old->tv_nsec) / 1000000; + } + return diff_ms; +} + +/*! + * \internal + * \brief Reset a command's operation times to their original values. + * + * Reset a command's run and queued timestamps to the timestamps of the original + * command, so we report the entire time since then and not just the time since + * the most recent command (for recurring and systemd operations). + * + * \param[in,out] cmd Executor command object to reset + * + * \note It's not obvious what the queued time should be for a systemd + * start/stop operation, which might go like this: + * initial command queued 5ms, runs 3s + * monitor command queued 10ms, runs 10s + * monitor command queued 10ms, runs 10s + * Is the queued time for that operation 5ms, 10ms or 25ms? The current + * implementation will report 5ms. If it's 25ms, then we need to + * subtract 20ms from the total exec time so as not to count it twice. + * We can implement that later if it matters to anyone ... + */ +static void +cmd_original_times(lrmd_cmd_t * cmd) +{ + cmd->t_run = cmd->t_first_run; + cmd->t_queue = cmd->t_first_queue; +} +#endif + +static inline bool +action_matches(const lrmd_cmd_t *cmd, const char *action, guint interval_ms) +{ + return (cmd->interval_ms == interval_ms) + && pcmk__str_eq(cmd->action, action, pcmk__str_casei); +} + +/*! 
+ * \internal + * \brief Log the result of an asynchronous command + * + * \param[in] cmd Command to log result for + * \param[in] exec_time_ms Execution time in milliseconds, if known + * \param[in] queue_time_ms Queue time in milliseconds, if known + */ +static void +log_finished(const lrmd_cmd_t *cmd, int exec_time_ms, int queue_time_ms) +{ + int log_level = LOG_INFO; + GString *str = g_string_sized_new(100); // reasonable starting size + + if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + log_level = LOG_DEBUG; + } + + g_string_append_printf(str, "%s %s (call %d", + cmd->rsc_id, cmd->action, cmd->call_id); + if (cmd->last_pid != 0) { + g_string_append_printf(str, ", PID %d", cmd->last_pid); + } + if (cmd->result.execution_status == PCMK_EXEC_DONE) { + g_string_append_printf(str, ") exited with status %d", + cmd->result.exit_status); + } else { + pcmk__g_strcat(str, ") could not be executed: ", + pcmk_exec_status_str(cmd->result.execution_status), + NULL); + } + if (cmd->result.exit_reason != NULL) { + pcmk__g_strcat(str, " (", cmd->result.exit_reason, ")", NULL); + } + +#ifdef PCMK__TIME_USE_CGT + pcmk__g_strcat(str, " (execution time ", + pcmk__readable_interval(exec_time_ms), NULL); + if (queue_time_ms > 0) { + pcmk__g_strcat(str, " after being queued ", + pcmk__readable_interval(queue_time_ms), NULL); + } + g_string_append_c(str, ')'); +#endif + + do_crm_log(log_level, "%s", str->str); + g_string_free(str, TRUE); +} + +static void +log_execute(lrmd_cmd_t * cmd) +{ + int log_level = LOG_INFO; + + if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + log_level = LOG_DEBUG; + } + + do_crm_log(log_level, "executing - rsc:%s action:%s call_id:%d", + cmd->rsc_id, cmd->action, cmd->call_id); +} + +static const char * +normalize_action_name(lrmd_rsc_t * rsc, const char *action) +{ + if (pcmk__str_eq(action, "monitor", pcmk__str_casei) && + pcmk_is_set(pcmk_get_ra_caps(rsc->class), pcmk_ra_cap_status)) { + return "status"; + } + return action; +} + +static lrmd_rsc_t * +build_rsc_from_xml(xmlNode * msg) +{ + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR); + lrmd_rsc_t *rsc = NULL; + + rsc = calloc(1, sizeof(lrmd_rsc_t)); + + crm_element_value_int(msg, F_LRMD_CALLOPTS, &rsc->call_opts); + + rsc->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID); + rsc->class = crm_element_value_copy(rsc_xml, F_LRMD_CLASS); + rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER); + rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE); + rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, execute_resource_action, + rsc); + + // Initialize fence device probes (to return "not running") + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_ERROR, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + return rsc; +} + +static lrmd_cmd_t * +create_lrmd_cmd(xmlNode *msg, pcmk__client_t *client) +{ + int call_options = 0; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR); + lrmd_cmd_t *cmd = NULL; + + cmd = calloc(1, sizeof(lrmd_cmd_t)); + + crm_element_value_int(msg, F_LRMD_CALLOPTS, &call_options); + cmd->call_opts = call_options; + cmd->client_id = strdup(client->id); + + crm_element_value_int(msg, F_LRMD_CALLID, &cmd->call_id); + crm_element_value_ms(rsc_xml, F_LRMD_RSC_INTERVAL, &cmd->interval_ms); + crm_element_value_int(rsc_xml, F_LRMD_TIMEOUT, &cmd->timeout); + crm_element_value_int(rsc_xml, F_LRMD_RSC_START_DELAY, &cmd->start_delay); + cmd->timeout_orig = cmd->timeout; + + cmd->origin = crm_element_value_copy(rsc_xml, F_LRMD_ORIGIN); + cmd->action = 
crm_element_value_copy(rsc_xml, F_LRMD_RSC_ACTION); + cmd->userdata_str = crm_element_value_copy(rsc_xml, F_LRMD_RSC_USERDATA_STR); + cmd->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID); + + cmd->params = xml2list(rsc_xml); + + if (pcmk__str_eq(g_hash_table_lookup(cmd->params, "CRM_meta_on_fail"), "block", pcmk__str_casei)) { + crm_debug("Setting flag to leave pid group on timeout and " + "only kill action pid for " PCMK__OP_FMT, + cmd->rsc_id, cmd->action, cmd->interval_ms); + cmd->service_flags = pcmk__set_flags_as(__func__, __LINE__, + LOG_TRACE, "Action", + cmd->action, 0, + SVC_ACTION_LEAVE_GROUP, + "SVC_ACTION_LEAVE_GROUP"); + } + return cmd; +} + +static void +stop_recurring_timer(lrmd_cmd_t *cmd) +{ + if (cmd) { + if (cmd->stonith_recurring_id) { + g_source_remove(cmd->stonith_recurring_id); + } + cmd->stonith_recurring_id = 0; + } +} + +static void +free_lrmd_cmd(lrmd_cmd_t * cmd) +{ + stop_recurring_timer(cmd); + if (cmd->delay_id) { + g_source_remove(cmd->delay_id); + } + if (cmd->params) { + g_hash_table_destroy(cmd->params); + } + pcmk__reset_result(&(cmd->result)); + free(cmd->origin); + free(cmd->action); + free(cmd->real_action); + free(cmd->userdata_str); + free(cmd->rsc_id); + free(cmd->client_id); + free(cmd); +} + +static gboolean +stonith_recurring_op_helper(gpointer data) +{ + lrmd_cmd_t *cmd = data; + lrmd_rsc_t *rsc; + + cmd->stonith_recurring_id = 0; + + if (!cmd->rsc_id) { + return FALSE; + } + + rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id); + + CRM_ASSERT(rsc != NULL); + /* take it out of recurring_ops list, and put it in the pending ops + * to be executed */ + rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd); + rsc->pending_ops = g_list_append(rsc->pending_ops, cmd); +#ifdef PCMK__TIME_USE_CGT + get_current_time(&(cmd->t_queue), &(cmd->t_first_queue)); +#endif + mainloop_set_trigger(rsc->work); + + return FALSE; +} + +static inline void +start_recurring_timer(lrmd_cmd_t *cmd) +{ + if (cmd && (cmd->interval_ms > 0)) { + cmd->stonith_recurring_id = g_timeout_add(cmd->interval_ms, + stonith_recurring_op_helper, + cmd); + } +} + +static gboolean +start_delay_helper(gpointer data) +{ + lrmd_cmd_t *cmd = data; + lrmd_rsc_t *rsc = NULL; + + cmd->delay_id = 0; + rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; + + if (rsc) { + mainloop_set_trigger(rsc->work); + } + + return FALSE; +} + +/*! + * \internal + * \brief Check whether a list already contains the equivalent of a given action + * + * \param[in] action_list List to search + * \param[in] cmd Action to search for + */ +static lrmd_cmd_t * +find_duplicate_action(const GList *action_list, const lrmd_cmd_t *cmd) +{ + for (const GList *item = action_list; item != NULL; item = item->next) { + lrmd_cmd_t *dup = item->data; + + if (action_matches(cmd, dup->action, dup->interval_ms)) { + return dup; + } + } + return NULL; +} + +static bool +merge_recurring_duplicate(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) +{ + lrmd_cmd_t * dup = NULL; + bool dup_pending = true; + + if (cmd->interval_ms == 0) { + return false; + } + + // Search for a duplicate of this action (in-flight or not) + dup = find_duplicate_action(rsc->pending_ops, cmd); + if (dup == NULL) { + dup_pending = false; + dup = find_duplicate_action(rsc->recurring_ops, cmd); + if (dup == NULL) { + return false; + } + } + + /* Do not merge fencing monitors marked for cancellation, so we can reply to + * the cancellation separately. 
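+ * (Merging would fold the new request into the cancelled entry, overwriting
+ * its call ID, and the client waiting on the cancellation would never get
+ * its PCMK_EXEC_CANCELLED reply.)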
+ */ + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_casei) + && (dup->result.execution_status == PCMK_EXEC_CANCELLED)) { + return false; + } + + /* This should not occur. If it does, we need to investigate how something + * like this is possible in the controller. + */ + crm_warn("Duplicate recurring op entry detected (" PCMK__OP_FMT + "), merging with previous op entry", + rsc->rsc_id, normalize_action_name(rsc, dup->action), + dup->interval_ms); + + // Merge new action's call ID and user data into existing action + dup->first_notify_sent = false; + free(dup->userdata_str); + dup->userdata_str = cmd->userdata_str; + cmd->userdata_str = NULL; + dup->call_id = cmd->call_id; + free_lrmd_cmd(cmd); + cmd = NULL; + + /* If dup is not pending, that means it has already executed at least once + * and is waiting in the interval. In that case, stop waiting and initiate + * a new instance now. + */ + if (!dup_pending) { + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_casei)) { + stop_recurring_timer(dup); + stonith_recurring_op_helper(dup); + } else { + services_action_kick(rsc->rsc_id, + normalize_action_name(rsc, dup->action), + dup->interval_ms); + } + } + return true; +} + +static void +schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) +{ + CRM_CHECK(cmd != NULL, return); + CRM_CHECK(rsc != NULL, return); + + crm_trace("Scheduling %s on %s", cmd->action, rsc->rsc_id); + + if (merge_recurring_duplicate(rsc, cmd)) { + // Equivalent of cmd has already been scheduled + return; + } + + /* The controller expects the executor to automatically cancel + * recurring operations before a resource stops. + */ + if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + cancel_all_recurring(rsc, NULL); + } + + rsc->pending_ops = g_list_append(rsc->pending_ops, cmd); +#ifdef PCMK__TIME_USE_CGT + get_current_time(&(cmd->t_queue), &(cmd->t_first_queue)); +#endif + mainloop_set_trigger(rsc->work); + + if (cmd->start_delay) { + cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd); + } +} + +static xmlNode * +create_lrmd_reply(const char *origin, int rc, int call_id) +{ + xmlNode *reply = create_xml_node(NULL, T_LRMD_REPLY); + + crm_xml_add(reply, F_LRMD_ORIGIN, origin); + crm_xml_add_int(reply, F_LRMD_RC, rc); + crm_xml_add_int(reply, F_LRMD_CALLID, call_id); + return reply; +} + +static void +send_client_notify(gpointer key, gpointer value, gpointer user_data) +{ + xmlNode *update_msg = user_data; + pcmk__client_t *client = value; + int rc; + int log_level = LOG_WARNING; + const char *msg = NULL; + + CRM_CHECK(client != NULL, return); + if (client->name == NULL) { + crm_trace("Skipping notification to client without name"); + return; + } + if (pcmk_is_set(client->flags, pcmk__client_to_proxy)) { + /* We only want to notify clients of the executor IPC API. If we are + * running as Pacemaker Remote, we may have clients proxied to other + * IPC services in the cluster, so skip those. 
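+ * (On a Pacemaker Remote node, for instance, CIB and attribute-manager
+ * clients are tunnelled through this daemon via remoted_proxy.c; they show
+ * up in the client table but must not receive executor events.)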
+ */ + crm_trace("Skipping executor API notification to client %s", + pcmk__client_name(client)); + return; + } + + rc = lrmd_server_send_notify(client, update_msg); + if (rc == pcmk_rc_ok) { + return; + } + + switch (rc) { + case ENOTCONN: + case EPIPE: // Client exited without waiting for notification + log_level = LOG_INFO; + msg = "Disconnected"; + break; + + default: + msg = pcmk_rc_str(rc); + break; + } + do_crm_log(log_level, "Could not notify client %s: %s " CRM_XS " rc=%d", + pcmk__client_name(client), msg, rc); +} + +static void +send_cmd_complete_notify(lrmd_cmd_t * cmd) +{ + xmlNode *notify = NULL; + int exec_time = 0; + int queue_time = 0; + +#ifdef PCMK__TIME_USE_CGT + exec_time = time_diff_ms(NULL, &(cmd->t_run)); + queue_time = time_diff_ms(&cmd->t_run, &(cmd->t_queue)); +#endif + log_finished(cmd, exec_time, queue_time); + + /* If the originator requested to be notified only for changes in recurring + * operation results, skip the notification if the result hasn't changed. + */ + if (cmd->first_notify_sent + && pcmk_is_set(cmd->call_opts, lrmd_opt_notify_changes_only) + && (cmd->last_notify_rc == cmd->result.exit_status) + && (cmd->last_notify_op_status == cmd->result.execution_status)) { + return; + } + + cmd->first_notify_sent = true; + cmd->last_notify_rc = cmd->result.exit_status; + cmd->last_notify_op_status = cmd->result.execution_status; + + notify = create_xml_node(NULL, T_LRMD_NOTIFY); + + crm_xml_add(notify, F_LRMD_ORIGIN, __func__); + crm_xml_add_int(notify, F_LRMD_TIMEOUT, cmd->timeout); + crm_xml_add_ms(notify, F_LRMD_RSC_INTERVAL, cmd->interval_ms); + crm_xml_add_int(notify, F_LRMD_RSC_START_DELAY, cmd->start_delay); + crm_xml_add_int(notify, F_LRMD_EXEC_RC, cmd->result.exit_status); + crm_xml_add_int(notify, F_LRMD_OP_STATUS, cmd->result.execution_status); + crm_xml_add_int(notify, F_LRMD_CALLID, cmd->call_id); + crm_xml_add_int(notify, F_LRMD_RSC_DELETED, cmd->rsc_deleted); + + crm_xml_add_ll(notify, F_LRMD_RSC_RUN_TIME, + (long long) cmd->epoch_last_run); + crm_xml_add_ll(notify, F_LRMD_RSC_RCCHANGE_TIME, + (long long) cmd->epoch_rcchange); +#ifdef PCMK__TIME_USE_CGT + crm_xml_add_int(notify, F_LRMD_RSC_EXEC_TIME, exec_time); + crm_xml_add_int(notify, F_LRMD_RSC_QUEUE_TIME, queue_time); +#endif + + crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_RSC_EXEC); + crm_xml_add(notify, F_LRMD_RSC_ID, cmd->rsc_id); + if(cmd->real_action) { + crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->real_action); + } else { + crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action); + } + crm_xml_add(notify, F_LRMD_RSC_USERDATA_STR, cmd->userdata_str); + crm_xml_add(notify, F_LRMD_RSC_EXIT_REASON, cmd->result.exit_reason); + + if (cmd->result.action_stderr != NULL) { + crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->result.action_stderr); + + } else if (cmd->result.action_stdout != NULL) { + crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->result.action_stdout); + } + + if (cmd->params) { + char *key = NULL; + char *value = NULL; + GHashTableIter iter; + + xmlNode *args = create_xml_node(notify, XML_TAG_ATTRS); + + g_hash_table_iter_init(&iter, cmd->params); + while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { + hash2smartfield((gpointer) key, (gpointer) value, args); + } + } + if ((cmd->client_id != NULL) + && pcmk_is_set(cmd->call_opts, lrmd_opt_notify_orig_only)) { + + pcmk__client_t *client = pcmk__find_client_by_id(cmd->client_id); + + if (client != NULL) { + send_client_notify(client->id, client, notify); + } + } else { + 
pcmk__foreach_ipc_client(send_client_notify, notify); + } + + free_xml(notify); +} + +static void +send_generic_notify(int rc, xmlNode * request) +{ + if (pcmk__ipc_client_count() != 0) { + int call_id = 0; + xmlNode *notify = NULL; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + const char *op = crm_element_value(request, F_LRMD_OPERATION); + + crm_element_value_int(request, F_LRMD_CALLID, &call_id); + + notify = create_xml_node(NULL, T_LRMD_NOTIFY); + crm_xml_add(notify, F_LRMD_ORIGIN, __func__); + crm_xml_add_int(notify, F_LRMD_RC, rc); + crm_xml_add_int(notify, F_LRMD_CALLID, call_id); + crm_xml_add(notify, F_LRMD_OPERATION, op); + crm_xml_add(notify, F_LRMD_RSC_ID, rsc_id); + + pcmk__foreach_ipc_client(send_client_notify, notify); + + free_xml(notify); + } +} + +static void +cmd_reset(lrmd_cmd_t * cmd) +{ + cmd->last_pid = 0; +#ifdef PCMK__TIME_USE_CGT + memset(&cmd->t_run, 0, sizeof(cmd->t_run)); + memset(&cmd->t_queue, 0, sizeof(cmd->t_queue)); +#endif + cmd->epoch_last_run = 0; + + pcmk__reset_result(&(cmd->result)); + cmd->result.execution_status = PCMK_EXEC_DONE; +} + +static void +cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc) +{ + crm_trace("Resource operation rsc:%s action:%s completed (%p %p)", cmd->rsc_id, cmd->action, + rsc ? rsc->active : NULL, cmd); + + if (rsc && (rsc->active == cmd)) { + rsc->active = NULL; + mainloop_set_trigger(rsc->work); + } + + if (!rsc) { + cmd->rsc_deleted = 1; + } + + /* reset original timeout so client notification has correct information */ + cmd->timeout = cmd->timeout_orig; + + send_cmd_complete_notify(cmd); + + if ((cmd->interval_ms != 0) + && (cmd->result.execution_status == PCMK_EXEC_CANCELLED)) { + + if (rsc) { + rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd); + rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd); + } + free_lrmd_cmd(cmd); + } else if (cmd->interval_ms == 0) { + if (rsc) { + rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd); + } + free_lrmd_cmd(cmd); + } else { + /* Clear all the values pertaining just to the last iteration of a recurring op. 
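+ * (The command object itself stays on the resource's recurring_ops list;
+ * cmd_reset() below clears only the last PID, the run/queue timestamps, and
+ * the previous result.)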
*/ + cmd_reset(cmd); + } +} + +struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +}; + +static void +notify_one_client(gpointer key, gpointer value, gpointer user_data) +{ + pcmk__client_t *client = value; + struct notify_new_client_data *data = user_data; + + if (!pcmk__str_eq(client->id, data->new_client->id, pcmk__str_casei)) { + send_client_notify(key, (gpointer) client, (gpointer) data->notify); + } +} + +void +notify_of_new_client(pcmk__client_t *new_client) +{ + struct notify_new_client_data data; + + data.new_client = new_client; + data.notify = create_xml_node(NULL, T_LRMD_NOTIFY); + crm_xml_add(data.notify, F_LRMD_ORIGIN, __func__); + crm_xml_add(data.notify, F_LRMD_OPERATION, LRMD_OP_NEW_CLIENT); + pcmk__foreach_ipc_client(notify_one_client, &data); + free_xml(data.notify); +} + +void +client_disconnect_cleanup(const char *client_id) +{ + GHashTableIter iter; + lrmd_rsc_t *rsc = NULL; + char *key = NULL; + + g_hash_table_iter_init(&iter, rsc_list); + while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) { + if (pcmk_all_flags_set(rsc->call_opts, lrmd_opt_drop_recurring)) { + /* This client is disconnecting, drop any recurring operations + * it may have initiated on the resource */ + cancel_all_recurring(rsc, client_id); + } + } +} + +static void +action_complete(svc_action_t * action) +{ + lrmd_rsc_t *rsc; + lrmd_cmd_t *cmd = action->cb_data; + enum ocf_exitcode code; + +#ifdef PCMK__TIME_USE_CGT + const char *rclass = NULL; + bool goagain = false; +#endif + + if (!cmd) { + crm_err("Completed executor action (%s) does not match any known operations", + action->id); + return; + } + +#ifdef PCMK__TIME_USE_CGT + if (cmd->result.exit_status != action->rc) { + cmd->epoch_rcchange = time(NULL); + } +#endif + + cmd->last_pid = action->pid; + + // Cast variable instead of function return to keep compilers happy + code = services_result2ocf(action->standard, cmd->action, action->rc); + pcmk__set_result(&(cmd->result), (int) code, + action->status, services__exit_reason(action)); + + rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; + +#ifdef PCMK__TIME_USE_CGT + if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_SERVICE, pcmk__str_casei)) { + rclass = resources_find_service_class(rsc->type); + } else if(rsc) { + rclass = rsc->class; + } + + if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) { + if (pcmk__result_ok(&(cmd->result)) + && pcmk__strcase_any_of(cmd->action, "start", "stop", NULL)) { + /* systemd returns from start and stop actions after the action + * begins, not after it completes. We have to jump through a few + * hoops so that we don't report 'complete' to the rest of pacemaker + * until it's actually done. 
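+ *
+ * (Rough sketch of the resulting flow, assuming a 20s configured timeout:
+ * the systemd start returns almost at once, a follow-up monitor is then
+ * rescheduled about every 2s (timeout_orig / 10, capped at 2000ms), and the
+ * action is finalized only once the monitor stops reporting
+ * PCMK_EXEC_PENDING or the remaining timeout is exhausted.)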
+ */ + goagain = true; + cmd->real_action = cmd->action; + cmd->action = strdup("monitor"); + + } else if (cmd->real_action != NULL) { + // This is follow-up monitor to check whether start/stop completed + if (cmd->result.execution_status == PCMK_EXEC_PENDING) { + goagain = true; + + } else if (pcmk__result_ok(&(cmd->result)) + && pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) { + goagain = true; + + } else { + int time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); + int timeout_left = cmd->timeout_orig - time_sum; + + crm_debug("%s systemd %s is now complete (elapsed=%dms, " + "remaining=%dms): %s (%d)", + cmd->rsc_id, cmd->real_action, time_sum, timeout_left, + services_ocf_exitcode_str(cmd->result.exit_status), + cmd->result.exit_status); + cmd_original_times(cmd); + + // Monitors may return "not running", but start/stop shouldn't + if ((cmd->result.execution_status == PCMK_EXEC_DONE) + && (cmd->result.exit_status == PCMK_OCF_NOT_RUNNING)) { + + if (pcmk__str_eq(cmd->real_action, "start", pcmk__str_casei)) { + cmd->result.exit_status = PCMK_OCF_UNKNOWN_ERROR; + } else if (pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) { + cmd->result.exit_status = PCMK_OCF_OK; + } + } + } + } + } +#endif + +#if SUPPORT_NAGIOS + if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei)) { + if (action_matches(cmd, "monitor", 0) + && pcmk__result_ok(&(cmd->result))) { + /* Successfully executed --version for the nagios plugin */ + cmd->result.exit_status = PCMK_OCF_NOT_RUNNING; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei) + && !pcmk__result_ok(&(cmd->result))) { +#ifdef PCMK__TIME_USE_CGT + goagain = true; +#endif + } + } +#endif + +#ifdef PCMK__TIME_USE_CGT + if (goagain) { + int time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); + int timeout_left = cmd->timeout_orig - time_sum; + int delay = cmd->timeout_orig / 10; + + if(delay >= timeout_left && timeout_left > 20) { + delay = timeout_left/2; + } + + delay = QB_MIN(2000, delay); + if (delay < timeout_left) { + cmd->start_delay = delay; + cmd->timeout = timeout_left; + + if (pcmk__result_ok(&(cmd->result))) { + crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay); + + } else if (cmd->result.execution_status == PCMK_EXEC_PENDING) { + crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->action, time_sum, timeout_left, delay); + + } else { + crm_notice("%s %s failed '%s' (%d): re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->action, + services_ocf_exitcode_str(cmd->result.exit_status), + cmd->result.exit_status, time_sum, timeout_left, + delay); + } + + cmd_reset(cmd); + if(rsc) { + rsc->active = NULL; + } + schedule_lrmd_cmd(rsc, cmd); + + /* Don't finalize cmd, we're not done with it yet */ + return; + + } else { + crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)", + cmd->rsc_id, + (cmd->real_action? 
cmd->real_action : cmd->action), + cmd->result.exit_status, time_sum, timeout_left); + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_TIMEOUT, + "Investigate reason for timeout, and adjust " + "configured operation timeout if necessary"); + cmd_original_times(cmd); + } + } +#endif + + pcmk__set_result_output(&(cmd->result), services__grab_stdout(action), + services__grab_stderr(action)); + cmd_finalize(cmd, rsc); +} + +/*! + * \internal + * \brief Process the result of a fence device action (start, stop, or monitor) + * + * \param[in,out] cmd Fence device action that completed + * \param[in] exit_status Fencer API exit status for action + * \param[in] execution_status Fencer API execution status for action + * \param[in] exit_reason Human-friendly detail, if action failed + */ +static void +stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + enum pcmk_exec_status execution_status, + const char *exit_reason) +{ + // This can be NULL if resource was removed before command completed + lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id); + + // Simplify fencer exit status to uniform exit status + if (exit_status != CRM_EX_OK) { + exit_status = PCMK_OCF_UNKNOWN_ERROR; + } + + if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) { + /* An in-flight fence action was cancelled. The execution status is + * already correct, so don't overwrite it. + */ + execution_status = PCMK_EXEC_CANCELLED; + + } else { + /* Some execution status codes have specific meanings for the fencer + * that executor clients may not expect, so map them to a simple error + * status. + */ + switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: + execution_status = PCMK_EXEC_ERROR; + break; + + case PCMK_EXEC_NO_FENCE_DEVICE: + /* This should be possible only for probes in practice, but + * interpret for all actions to be safe. + */ + if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS, + pcmk__str_none)) { + exit_status = PCMK_OCF_NOT_RUNNING; + + } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP, + pcmk__str_none)) { + exit_status = PCMK_OCF_OK; + + } else { + exit_status = PCMK_OCF_NOT_INSTALLED; + } + execution_status = PCMK_EXEC_ERROR; + break; + + case PCMK_EXEC_NOT_SUPPORTED: + exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE; + break; + + default: + break; + } + } + + pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason); + + // Certain successful actions change the known state of the resource + if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { + + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_OK, + PCMK_EXEC_DONE, NULL); // "running" + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_ERROR, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); // "not running" + } + } + + /* The recurring timer should not be running at this point in any case, but + * as a failsafe, stop it if it is. + */ + stop_recurring_timer(cmd); + + /* Reschedule this command if appropriate. If a recurring command is *not* + * rescheduled, its status must be PCMK_EXEC_CANCELLED, otherwise it will + * not be removed from recurring_ops by cmd_finalize(). 
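+ * (cmd_finalize() frees a recurring command only when its execution status
+ * is PCMK_EXEC_CANCELLED; any other status resets the command and leaves it
+ * on recurring_ops awaiting the next timer pop.)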
+ */ + if (rsc && (cmd->interval_ms > 0) + && (cmd->result.execution_status != PCMK_EXEC_CANCELLED)) { + start_recurring_timer(cmd); + } + + cmd_finalize(cmd, rsc); +} + +static void +lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) +{ + if ((data == NULL) || (data->userdata == NULL)) { + crm_err("Ignoring fence action result: " + "Invalid callback arguments (bug?)"); + } else { + stonith_action_complete((lrmd_cmd_t *) data->userdata, + stonith__exit_status(data), + stonith__execution_status(data), + stonith__exit_reason(data)); + } +} + +void +stonith_connection_failed(void) +{ + GHashTableIter iter; + lrmd_rsc_t *rsc = NULL; + + crm_warn("Connection to fencer lost (any pending operations for " + "fence devices will be considered failed)"); + + g_hash_table_iter_init(&iter, rsc_list); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &rsc)) { + if (!pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_none)) { + continue; + } + + /* If we registered this fence device, we don't know whether the + * fencer still has the registration or not. Cause future probes to + * return an error until the resource is stopped or started + * successfully. This is especially important if the controller also + * went away (possibly due to a cluster layer restart) and won't + * receive our client notification of any monitors finalized below. + */ + if (rsc->fence_probe_result.execution_status == PCMK_EXEC_DONE) { + pcmk__set_result(&rsc->fence_probe_result, CRM_EX_ERROR, + PCMK_EXEC_NOT_CONNECTED, + "Lost connection to fencer"); + } + + // Consider any active, pending, or recurring operations as failed + + for (GList *op = rsc->recurring_ops; op != NULL; op = op->next) { + lrmd_cmd_t *cmd = op->data; + + /* This won't free a recurring op but instead restart its timer. + * If cmd is rsc->active, this will set rsc->active to NULL, so we + * don't have to worry about finalizing it a second time below. + */ + stonith_action_complete(cmd, + CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, + "Lost connection to fencer"); + } + + if (rsc->active != NULL) { + rsc->pending_ops = g_list_prepend(rsc->pending_ops, rsc->active); + } + while (rsc->pending_ops != NULL) { + // This will free the op and remove it from rsc->pending_ops + stonith_action_complete((lrmd_cmd_t *) rsc->pending_ops->data, + CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, + "Lost connection to fencer"); + } + } +} + +/*! + * \internal + * \brief Execute a stonith resource "start" action + * + * Start a stonith resource by registering it with the fencer. + * (Stonith agents don't have a start command.) 
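+ * Duplicate registrations are harmless; see the comment in the function
+ * body about racing with the fencer's own CIB-triggered registration.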
+ * + * \param[in,out] stonith_api Connection to fencer + * \param[in] rsc Stonith resource to start + * \param[in] cmd Start command to execute + * + * \return pcmk_ok on success, -errno otherwise + */ +static int +execd_stonith_start(stonith_t *stonith_api, const lrmd_rsc_t *rsc, + const lrmd_cmd_t *cmd) +{ + char *key = NULL; + char *value = NULL; + stonith_key_value_t *device_params = NULL; + int rc = pcmk_ok; + + // Convert command parameters to stonith API key/values + if (cmd->params) { + GHashTableIter iter; + + g_hash_table_iter_init(&iter, cmd->params); + while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { + device_params = stonith_key_value_add(device_params, key, value); + } + } + + /* The fencer will automatically register devices via CIB notifications + * when the CIB changes, but to avoid a possible race condition between + * the fencer receiving the notification and the executor requesting that + * resource, the executor registers the device as well. The fencer knows how + * to handle duplicate registrations. + */ + rc = stonith_api->cmds->register_device(stonith_api, st_opt_sync_call, + cmd->rsc_id, rsc->provider, + rsc->type, device_params); + + stonith_key_value_freeall(device_params, 1, 1); + return rc; +} + +/*! + * \internal + * \brief Execute a stonith resource "stop" action + * + * Stop a stonith resource by unregistering it with the fencer. + * (Stonith agents don't have a stop command.) + * + * \param[in,out] stonith_api Connection to fencer + * \param[in] rsc Stonith resource to stop + * + * \return pcmk_ok on success, -errno otherwise + */ +static inline int +execd_stonith_stop(stonith_t *stonith_api, const lrmd_rsc_t *rsc) +{ + /* @TODO Failure would indicate a problem communicating with fencer; + * perhaps we should try reconnecting and retrying a few times? + */ + return stonith_api->cmds->remove_device(stonith_api, st_opt_sync_call, + rsc->rsc_id); +} + +/*! 
+ * \internal + * \brief Initiate a stonith resource agent recurring "monitor" action + * + * \param[in,out] stonith_api Connection to fencer + * \param[in,out] rsc Stonith resource to monitor + * \param[in] cmd Monitor command being executed + * + * \return pcmk_ok if monitor was successfully initiated, -errno otherwise + */ +static inline int +execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) +{ + int rc = stonith_api->cmds->monitor(stonith_api, 0, cmd->rsc_id, + cmd->timeout / 1000); + + rc = stonith_api->cmds->register_callback(stonith_api, rc, 0, 0, cmd, + "lrmd_stonith_callback", + lrmd_stonith_callback); + if (rc == TRUE) { + rsc->active = cmd; + rc = pcmk_ok; + } else { + rc = -pcmk_err_generic; + } + return rc; +} + +static void +execute_stonith_action(lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) +{ + int rc = 0; + bool do_monitor = FALSE; + + stonith_t *stonith_api = get_stonith_connection(); + + if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei) + && (cmd->interval_ms == 0)) { + // Probes don't require a fencer connection + stonith_action_complete(cmd, rsc->fence_probe_result.exit_status, + rsc->fence_probe_result.execution_status, + rsc->fence_probe_result.exit_reason); + return; + + } else if (stonith_api == NULL) { + stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_NOT_CONNECTED, + "No connection to fencer"); + return; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); + if (rc == pcmk_ok) { + do_monitor = TRUE; + } + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + rc = execd_stonith_stop(stonith_api, rsc); + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + do_monitor = TRUE; + + } else { + stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE, + PCMK_EXEC_ERROR, + "Invalid fence device action (bug?)"); + return; + } + + if (do_monitor) { + rc = execd_stonith_monitor(stonith_api, rsc, cmd); + if (rc == pcmk_ok) { + // Don't clean up yet, we will find out result of the monitor later + return; + } + } + + stonith_action_complete(cmd, + ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), + stonith__legacy2status(rc), + ((rc == -pcmk_err_generic)? 
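/* generic errors carry no useful exit reason */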
NULL : pcmk_strerror(rc))); +} + +static void +execute_nonstonith_action(lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) +{ + svc_action_t *action = NULL; + GHashTable *params_copy = NULL; + + CRM_ASSERT(rsc); + CRM_ASSERT(cmd); + + crm_trace("Creating action, resource:%s action:%s class:%s provider:%s agent:%s", + rsc->rsc_id, cmd->action, rsc->class, rsc->provider, rsc->type); + +#if SUPPORT_NAGIOS + /* Recurring operations are cancelled anyway for a stop operation */ + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei) + && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + + cmd->result.exit_status = PCMK_OCF_OK; + cmd_finalize(cmd, rsc); + return; + } +#endif + + params_copy = pcmk__str_table_dup(cmd->params); + + action = services__create_resource_action(rsc->rsc_id, rsc->class, rsc->provider, + rsc->type, + normalize_action_name(rsc, cmd->action), + cmd->interval_ms, cmd->timeout, + params_copy, cmd->service_flags); + + if (action == NULL) { + pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, + PCMK_EXEC_ERROR, strerror(ENOMEM)); + cmd_finalize(cmd, rsc); + return; + } + + if (action->rc != PCMK_OCF_UNKNOWN) { + pcmk__set_result(&(cmd->result), action->rc, action->status, + services__exit_reason(action)); + services_action_free(action); + cmd_finalize(cmd, rsc); + return; + } + + action->cb_data = cmd; + + if (services_action_async(action, action_complete)) { + /* The services library has taken responsibility for the action. It + * could be pending, blocked, or merged into a duplicate recurring + * action, in which case the action callback (action_complete()) + * will be called when the action completes, otherwise the callback has + * already been called. + * + * action_complete() calls cmd_finalize() which can free cmd, so cmd + * cannot be used here. + */ + } else { + /* This is a recurring action that is not being cancelled and could not + * be initiated. It has been rescheduled, and the action callback + * (action_complete()) has been called, which in this case has already + * called cmd_finalize(), which in this case should only reset (not + * free) cmd. 
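+         * Since cmd was only reset, it remains valid here, so the failed
+         * action's result can still be recorded on it below before the
+         * action is freed.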
+ */ + + pcmk__set_result(&(cmd->result), action->rc, action->status, + services__exit_reason(action)); + services_action_free(action); + } +} + +static gboolean +execute_resource_action(gpointer user_data) +{ + lrmd_rsc_t *rsc = (lrmd_rsc_t *) user_data; + lrmd_cmd_t *cmd = NULL; + + CRM_CHECK(rsc != NULL, return FALSE); + + if (rsc->active) { + crm_trace("%s is still active", rsc->rsc_id); + return TRUE; + } + + if (rsc->pending_ops) { + GList *first = rsc->pending_ops; + + cmd = first->data; + if (cmd->delay_id) { + crm_trace + ("Command %s %s was asked to run too early, waiting for start_delay timeout of %dms", + cmd->rsc_id, cmd->action, cmd->start_delay); + return TRUE; + } + rsc->pending_ops = g_list_remove_link(rsc->pending_ops, first); + g_list_free_1(first); + +#ifdef PCMK__TIME_USE_CGT + get_current_time(&(cmd->t_run), &(cmd->t_first_run)); +#endif + cmd->epoch_last_run = time(NULL); + } + + if (!cmd) { + crm_trace("Nothing further to do for %s", rsc->rsc_id); + return TRUE; + } + + rsc->active = cmd; /* only one op at a time for a rsc */ + if (cmd->interval_ms) { + rsc->recurring_ops = g_list_append(rsc->recurring_ops, cmd); + } + + log_execute(cmd); + + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { + execute_stonith_action(rsc, cmd); + } else { + execute_nonstonith_action(rsc, cmd); + } + + return TRUE; +} + +void +free_rsc(gpointer data) +{ + GList *gIter = NULL; + lrmd_rsc_t *rsc = data; + int is_stonith = pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, + pcmk__str_casei); + + gIter = rsc->pending_ops; + while (gIter != NULL) { + GList *next = gIter->next; + lrmd_cmd_t *cmd = gIter->data; + + /* command was never executed */ + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + cmd_finalize(cmd, NULL); + + gIter = next; + } + /* frees list, but not list elements. */ + g_list_free(rsc->pending_ops); + + gIter = rsc->recurring_ops; + while (gIter != NULL) { + GList *next = gIter->next; + lrmd_cmd_t *cmd = gIter->data; + + if (is_stonith) { + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + /* If a stonith command is in-flight, just mark it as cancelled; + * it is not safe to finalize/free the cmd until the stonith api + * says it has either completed or timed out. + */ + if (rsc->active != cmd) { + cmd_finalize(cmd, NULL); + } + } else { + /* This command is already handed off to service library, + * let service library cancel it and tell us via the callback + * when it is cancelled. The rsc can be safely destroyed + * even if we are waiting for the cancel result */ + services_action_cancel(rsc->rsc_id, + normalize_action_name(rsc, cmd->action), + cmd->interval_ms); + } + + gIter = next; + } + /* frees list, but not list elements. 
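(the commands themselves were finalized above or handed to the services library for cancellation)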
*/ + g_list_free(rsc->recurring_ops); + + free(rsc->rsc_id); + free(rsc->class); + free(rsc->provider); + free(rsc->type); + mainloop_destroy_trigger(rsc->work); + + free(rsc); +} + +static int +process_lrmd_signon(pcmk__client_t *client, xmlNode *request, int call_id, + xmlNode **reply) +{ + int rc = pcmk_ok; + time_t now = time(NULL); + const char *protocol_version = crm_element_value(request, F_LRMD_PROTOCOL_VERSION); + + if (compare_version(protocol_version, LRMD_MIN_PROTOCOL_VERSION) < 0) { + crm_err("Cluster API version must be greater than or equal to %s, not %s", + LRMD_MIN_PROTOCOL_VERSION, protocol_version); + rc = -EPROTO; + } + + if (pcmk__xe_attr_is_true(request, F_LRMD_IS_IPC_PROVIDER)) { +#ifdef PCMK__COMPILE_REMOTE + if ((client->remote != NULL) + && pcmk_is_set(client->flags, + pcmk__client_tls_handshake_complete)) { + + // This is a remote connection from a cluster node's controller + ipc_proxy_add_provider(client); + } else { + rc = -EACCES; + } +#else + rc = -EPROTONOSUPPORT; +#endif + } + + *reply = create_lrmd_reply(__func__, rc, call_id); + crm_xml_add(*reply, F_LRMD_OPERATION, CRM_OP_REGISTER); + crm_xml_add(*reply, F_LRMD_CLIENTID, client->id); + crm_xml_add(*reply, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION); + crm_xml_add_ll(*reply, PCMK__XA_UPTIME, now - start_time); + + return rc; +} + +static int +process_lrmd_rsc_register(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + int rc = pcmk_ok; + lrmd_rsc_t *rsc = build_rsc_from_xml(request); + lrmd_rsc_t *dup = g_hash_table_lookup(rsc_list, rsc->rsc_id); + + if (dup && + pcmk__str_eq(rsc->class, dup->class, pcmk__str_casei) && + pcmk__str_eq(rsc->provider, dup->provider, pcmk__str_casei) && pcmk__str_eq(rsc->type, dup->type, pcmk__str_casei)) { + + crm_notice("Ignoring duplicate registration of '%s'", rsc->rsc_id); + free_rsc(rsc); + return rc; + } + + g_hash_table_replace(rsc_list, rsc->rsc_id, rsc); + crm_info("Cached agent information for '%s'", rsc->rsc_id); + return rc; +} + +static xmlNode * +process_lrmd_get_rsc_info(xmlNode *request, int call_id) +{ + int rc = pcmk_ok; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + xmlNode *reply = NULL; + lrmd_rsc_t *rsc = NULL; + + if (rsc_id == NULL) { + rc = -ENODEV; + } else { + rsc = g_hash_table_lookup(rsc_list, rsc_id); + if (rsc == NULL) { + crm_info("Agent information for '%s' not in cache", rsc_id); + rc = -ENODEV; + } + } + + reply = create_lrmd_reply(__func__, rc, call_id); + if (rsc) { + crm_xml_add(reply, F_LRMD_RSC_ID, rsc->rsc_id); + crm_xml_add(reply, F_LRMD_CLASS, rsc->class); + crm_xml_add(reply, F_LRMD_PROVIDER, rsc->provider); + crm_xml_add(reply, F_LRMD_TYPE, rsc->type); + } + return reply; +} + +static int +process_lrmd_rsc_unregister(pcmk__client_t *client, uint32_t id, + xmlNode *request) +{ + int rc = pcmk_ok; + lrmd_rsc_t *rsc = NULL; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + + if (!rsc_id) { + return -ENODEV; + } + + rsc = g_hash_table_lookup(rsc_list, rsc_id); + if (rsc == NULL) { + crm_info("Ignoring unregistration of resource '%s', which is not registered", + rsc_id); + return pcmk_ok; + } + + if (rsc->active) { + /* let the caller know there are still active ops on this rsc to watch for */ + crm_trace("Operation (%p) still in progress for unregistered resource %s", + rsc->active, rsc_id); + rc = -EINPROGRESS; + } + + 
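+    /* Removing the table entry invokes free_rsc() (registered as the
+     * value-free function for rsc_list), which cancels or finalizes any
+     * remaining pending and recurring operations for this resource.
+     */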
g_hash_table_remove(rsc_list, rsc_id); + + return rc; +} + +static int +process_lrmd_rsc_exec(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + lrmd_rsc_t *rsc = NULL; + lrmd_cmd_t *cmd = NULL; + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + int call_id; + + if (!rsc_id) { + return -EINVAL; + } + if (!(rsc = g_hash_table_lookup(rsc_list, rsc_id))) { + crm_info("Resource '%s' not found (%d active resources)", + rsc_id, g_hash_table_size(rsc_list)); + return -ENODEV; + } + + cmd = create_lrmd_cmd(request, client); + call_id = cmd->call_id; + + /* Don't reference cmd after handing it off to be scheduled. + * The cmd could get merged and freed. */ + schedule_lrmd_cmd(rsc, cmd); + + return call_id; +} + +static int +cancel_op(const char *rsc_id, const char *action, guint interval_ms) +{ + GList *gIter = NULL; + lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, rsc_id); + + /* How to cancel an action. + * 1. Check pending ops list, if it hasn't been handed off + * to the service library or stonith recurring list remove + * it there and that will stop it. + * 2. If it isn't in the pending ops list, then it's either a + * recurring op in the stonith recurring list, or the service + * library's recurring list. Stop it there + * 3. If not found in any lists, then this operation has either + * been executed already and is not a recurring operation, or + * never existed. + */ + if (!rsc) { + return -ENODEV; + } + + for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) { + lrmd_cmd_t *cmd = gIter->data; + + if (action_matches(cmd, action, interval_ms)) { + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + cmd_finalize(cmd, rsc); + return pcmk_ok; + } + } + + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { + /* The service library does not handle stonith operations. + * We have to handle recurring stonith operations ourselves. */ + for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) { + lrmd_cmd_t *cmd = gIter->data; + + if (action_matches(cmd, action, interval_ms)) { + cmd->result.execution_status = PCMK_EXEC_CANCELLED; + if (rsc->active != cmd) { + cmd_finalize(cmd, rsc); + } + return pcmk_ok; + } + } + } else if (services_action_cancel(rsc_id, + normalize_action_name(rsc, action), + interval_ms) == TRUE) { + /* The service library will tell the action_complete callback function + * this action was cancelled, which will destroy the cmd and remove + * it from the recurring_op list. Do not do that in this function + * if the service library says it cancelled it. */ + return pcmk_ok; + } + + return -EOPNOTSUPP; +} + +static void +cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id) +{ + GList *cmd_list = NULL; + GList *cmd_iter = NULL; + + /* Notice a copy of each list is created when concat is called. + * This prevents odd behavior from occurring when the cmd_list + * is iterated through later on. It is possible the cancel_op + * function may end up modifying the recurring_ops and pending_ops + * lists. 
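(cancel_op() can call cmd_finalize(), which removes entries from those lists.)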
If we did not copy those lists, our cmd_list iteration + * could get messed up.*/ + if (rsc->recurring_ops) { + cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->recurring_ops)); + } + if (rsc->pending_ops) { + cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->pending_ops)); + } + if (!cmd_list) { + return; + } + + for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) { + lrmd_cmd_t *cmd = cmd_iter->data; + + if (cmd->interval_ms == 0) { + continue; + } + + if (client_id && !pcmk__str_eq(cmd->client_id, client_id, pcmk__str_casei)) { + continue; + } + + cancel_op(rsc->rsc_id, cmd->action, cmd->interval_ms); + } + /* frees only the copied list data, not the cmds */ + g_list_free(cmd_list); +} + +static int +process_lrmd_rsc_cancel(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR); + const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + const char *action = crm_element_value(rsc_xml, F_LRMD_RSC_ACTION); + guint interval_ms = 0; + + crm_element_value_ms(rsc_xml, F_LRMD_RSC_INTERVAL, &interval_ms); + + if (!rsc_id || !action) { + return -EINVAL; + } + + return cancel_op(rsc_id, action, interval_ms); +} + +static void +add_recurring_op_xml(xmlNode *reply, lrmd_rsc_t *rsc) +{ + xmlNode *rsc_xml = create_xml_node(reply, F_LRMD_RSC); + + crm_xml_add(rsc_xml, F_LRMD_RSC_ID, rsc->rsc_id); + for (GList *item = rsc->recurring_ops; item != NULL; item = item->next) { + lrmd_cmd_t *cmd = item->data; + xmlNode *op_xml = create_xml_node(rsc_xml, T_LRMD_RSC_OP); + + crm_xml_add(op_xml, F_LRMD_RSC_ACTION, + (cmd->real_action? cmd->real_action : cmd->action)); + crm_xml_add_ms(op_xml, F_LRMD_RSC_INTERVAL, cmd->interval_ms); + crm_xml_add_int(op_xml, F_LRMD_TIMEOUT, cmd->timeout_orig); + } +} + +static xmlNode * +process_lrmd_get_recurring(xmlNode *request, int call_id) +{ + int rc = pcmk_ok; + const char *rsc_id = NULL; + lrmd_rsc_t *rsc = NULL; + xmlNode *reply = NULL; + xmlNode *rsc_xml = NULL; + + // Resource ID is optional + rsc_xml = first_named_child(request, F_LRMD_CALLDATA); + if (rsc_xml) { + rsc_xml = first_named_child(rsc_xml, F_LRMD_RSC); + } + if (rsc_xml) { + rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID); + } + + // If resource ID is specified, resource must exist + if (rsc_id != NULL) { + rsc = g_hash_table_lookup(rsc_list, rsc_id); + if (rsc == NULL) { + crm_info("Resource '%s' not found (%d active resources)", + rsc_id, g_hash_table_size(rsc_list)); + rc = -ENODEV; + } + } + + reply = create_lrmd_reply(__func__, rc, call_id); + + // If resource ID is not specified, check all resources + if (rsc_id == NULL) { + GHashTableIter iter; + char *key = NULL; + + g_hash_table_iter_init(&iter, rsc_list); + while (g_hash_table_iter_next(&iter, (gpointer *) &key, + (gpointer *) &rsc)) { + add_recurring_op_xml(reply, rsc); + } + } else if (rsc) { + add_recurring_op_xml(reply, rsc); + } + return reply; +} + +void +process_lrmd_message(pcmk__client_t *client, uint32_t id, xmlNode *request) +{ + int rc = pcmk_ok; + int call_id = 0; + const char *op = crm_element_value(request, F_LRMD_OPERATION); + int do_reply = 0; + int do_notify = 0; + xmlNode *reply = NULL; + + /* Certain IPC commands may be done only by privileged users (i.e. root or + * hacluster), because they would otherwise provide a means of bypassing + * ACLs. 
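+     * Of the operations handled below, only the register (signon) and
+     * poke requests are permitted for unprivileged clients.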
+ */ + bool allowed = pcmk_is_set(client->flags, pcmk__client_privileged); + + crm_trace("Processing %s operation from %s", op, client->id); + crm_element_value_int(request, F_LRMD_CALLID, &call_id); + + if (pcmk__str_eq(op, CRM_OP_IPC_FWD, pcmk__str_none)) { +#ifdef PCMK__COMPILE_REMOTE + if (allowed) { + ipc_proxy_forward_client(client, request); + } else { + rc = -EACCES; + } +#else + rc = -EPROTONOSUPPORT; +#endif + do_reply = 1; + } else if (pcmk__str_eq(op, CRM_OP_REGISTER, pcmk__str_none)) { + rc = process_lrmd_signon(client, request, call_id, &reply); + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_REG, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_register(client, id, request); + do_notify = 1; + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_INFO, pcmk__str_none)) { + if (allowed) { + reply = process_lrmd_get_rsc_info(request, call_id); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_UNREG, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_unregister(client, id, request); + /* don't notify anyone about failed un-registers */ + if (rc == pcmk_ok || rc == -EINPROGRESS) { + do_notify = 1; + } + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_EXEC, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_exec(client, id, request); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_RSC_CANCEL, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_rsc_cancel(client, id, request); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_POKE, pcmk__str_none)) { + do_notify = 1; + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_CHECK, pcmk__str_none)) { + if (allowed) { + xmlNode *data = get_message_xml(request, F_LRMD_CALLDATA); + + CRM_LOG_ASSERT(data != NULL); + pcmk__valid_sbd_timeout(crm_element_value(data, F_LRMD_WATCHDOG)); + } else { + rc = -EACCES; + } + } else if (pcmk__str_eq(op, LRMD_OP_ALERT_EXEC, pcmk__str_none)) { + if (allowed) { + rc = process_lrmd_alert_exec(client, id, request); + } else { + rc = -EACCES; + } + do_reply = 1; + } else if (pcmk__str_eq(op, LRMD_OP_GET_RECURRING, pcmk__str_none)) { + if (allowed) { + reply = process_lrmd_get_recurring(request, call_id); + } else { + rc = -EACCES; + } + do_reply = 1; + } else { + rc = -EOPNOTSUPP; + do_reply = 1; + crm_err("Unknown IPC request '%s' from client %s", + op, pcmk__client_name(client)); + } + + if (rc == -EACCES) { + crm_warn("Rejecting IPC request '%s' from unprivileged client %s", + op, pcmk__client_name(client)); + } + + crm_debug("Processed %s operation from %s: rc=%d, reply=%d, notify=%d", + op, client->id, rc, do_reply, do_notify); + + if (do_reply) { + int send_rc = pcmk_rc_ok; + + if (reply == NULL) { + reply = create_lrmd_reply(__func__, rc, call_id); + } + send_rc = lrmd_server_send_reply(client, id, reply); + free_xml(reply); + if (send_rc != pcmk_rc_ok) { + crm_warn("Reply to client %s failed: %s " CRM_XS " rc=%d", + pcmk__client_name(client), pcmk_rc_str(send_rc), send_rc); + } + } + + if (do_notify) { + send_generic_notify(rc, request); + } +} diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c new file mode 100644 index 0000000..83a8cd7 --- /dev/null +++ b/daemons/execd/pacemaker-execd.c @@ -0,0 +1,582 @@ +/* + * Copyright 2012-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ *
+ * This source code is licensed under the GNU Lesser General Public License
+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
+ */
+
+#include <crm_internal.h>
+
+#include <glib.h>
+#include <signal.h>
+#include <sys/types.h>
+
+#include <crm/crm.h>
+#include <crm/msg_xml.h>
+#include <crm/services.h>
+#include <crm/common/cmdline_internal.h>
+#include <crm/common/ipc.h>
+#include <crm/common/ipc_internal.h>
+#include <crm/common/mainloop.h>
+#include <crm/common/output_internal.h>
+#include <crm/common/remote_internal.h>
+#include <crm/lrmd_internal.h>
+
+#include "pacemaker-execd.h"
+
+#ifdef PCMK__COMPILE_REMOTE
+# define EXECD_TYPE "remote"
+# define EXECD_NAME "pacemaker-remoted"
+# define SUMMARY "resource agent executor daemon for Pacemaker Remote nodes"
+#else
+# define EXECD_TYPE "local"
+# define EXECD_NAME "pacemaker-execd"
+# define SUMMARY "resource agent executor daemon for Pacemaker cluster nodes"
+#endif
+
+static GMainLoop *mainloop = NULL;
+static qb_ipcs_service_t *ipcs = NULL;
+static stonith_t *stonith_api = NULL;
+int lrmd_call_id = 0;
+time_t start_time;
+
+static struct {
+    gchar **log_files;
+#ifdef PCMK__COMPILE_REMOTE
+    gchar *port;
+#endif // PCMK__COMPILE_REMOTE
+} options;
+
+#ifdef PCMK__COMPILE_REMOTE
+/* whether shutdown request has been sent */
+static gboolean shutting_down = FALSE;
+
+/* timer for waiting for acknowledgment of shutdown request */
+static guint shutdown_ack_timer = 0;
+
+static gboolean lrmd_exit(gpointer data);
+#endif
+
+static void
+stonith_connection_destroy_cb(stonith_t * st, stonith_event_t * e)
+{
+    stonith_api->state = stonith_disconnected;
+    stonith_connection_failed();
+}
+
+stonith_t *
+get_stonith_connection(void)
+{
+    if (stonith_api && stonith_api->state == stonith_disconnected) {
+        stonith_api_delete(stonith_api);
+        stonith_api = NULL;
+    }
+
+    if (stonith_api == NULL) {
+        int rc = pcmk_ok;
+
+        stonith_api = stonith_api_new();
+        if (stonith_api == NULL) {
+            crm_err("Could not connect to fencer: API memory allocation failed");
+            return NULL;
+        }
+        rc = stonith_api_connect_retry(stonith_api, crm_system_name, 10);
+        if (rc != pcmk_ok) {
+            crm_err("Could not connect to fencer in 10 attempts: %s "
+                    CRM_XS " rc=%d", pcmk_strerror(rc), rc);
+            stonith_api_delete(stonith_api);
+            stonith_api = NULL;
+        } else {
+            stonith_api->cmds->register_notification(stonith_api,
+                                                     T_STONITH_NOTIFY_DISCONNECT,
+                                                     stonith_connection_destroy_cb);
+        }
+    }
+    return stonith_api;
+}
+
+static int32_t
+lrmd_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
+{
+    crm_trace("Connection %p", c);
+    if (pcmk__new_client(c, uid, gid) == NULL) {
+        return -EIO;
+    }
+    return 0;
+}
+
+static void
+lrmd_ipc_created(qb_ipcs_connection_t * c)
+{
+    pcmk__client_t *new_client = pcmk__find_client(c);
+
+    crm_trace("Connection %p", c);
+    CRM_ASSERT(new_client != NULL);
+    /* Now that the connection is officially established, alert
+     * the other clients that a new connection exists.
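+     * (notify_of_new_client() broadcasts an executor API notification
+     * to every other registered client.)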
*/ + + notify_of_new_client(new_client); +} + +static int32_t +lrmd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size) +{ + uint32_t id = 0; + uint32_t flags = 0; + pcmk__client_t *client = pcmk__find_client(c); + xmlNode *request = pcmk__client_data2xml(client, data, &id, &flags); + + CRM_CHECK(client != NULL, crm_err("Invalid client"); + return FALSE); + CRM_CHECK(client->id != NULL, crm_err("Invalid client: %p", client); + return FALSE); + + CRM_CHECK(flags & crm_ipc_client_response, crm_err("Invalid client request: %p", client); + return FALSE); + + if (!request) { + return 0; + } + + if (!client->name) { + const char *value = crm_element_value(request, F_LRMD_CLIENTNAME); + + if (value == NULL) { + client->name = pcmk__itoa(pcmk__client_pid(c)); + } else { + client->name = strdup(value); + } + } + + lrmd_call_id++; + if (lrmd_call_id < 1) { + lrmd_call_id = 1; + } + + crm_xml_add(request, F_LRMD_CLIENTID, client->id); + crm_xml_add(request, F_LRMD_CLIENTNAME, client->name); + crm_xml_add_int(request, F_LRMD_CALLID, lrmd_call_id); + + process_lrmd_message(client, id, request); + + free_xml(request); + return 0; +} + +/*! + * \internal + * \brief Free a client connection, and exit if appropriate + * + * \param[in,out] client Client connection to free + */ +void +lrmd_client_destroy(pcmk__client_t *client) +{ + pcmk__free_client(client); + +#ifdef PCMK__COMPILE_REMOTE + /* If we were waiting to shut down, we can now safely do so + * if there are no more proxied IPC providers + */ + if (shutting_down && (ipc_proxy_get_provider() == NULL)) { + lrmd_exit(NULL); + } +#endif +} + +static int32_t +lrmd_ipc_closed(qb_ipcs_connection_t * c) +{ + pcmk__client_t *client = pcmk__find_client(c); + + if (client == NULL) { + return 0; + } + + crm_trace("Connection %p", c); + client_disconnect_cleanup(client->id); +#ifdef PCMK__COMPILE_REMOTE + ipc_proxy_remove_provider(client); +#endif + lrmd_client_destroy(client); + return 0; +} + +static void +lrmd_ipc_destroy(qb_ipcs_connection_t * c) +{ + lrmd_ipc_closed(c); + crm_trace("Connection %p", c); +} + +static struct qb_ipcs_service_handlers lrmd_ipc_callbacks = { + .connection_accept = lrmd_ipc_accept, + .connection_created = lrmd_ipc_created, + .msg_process = lrmd_ipc_dispatch, + .connection_closed = lrmd_ipc_closed, + .connection_destroyed = lrmd_ipc_destroy +}; + +// \return Standard Pacemaker return code +int +lrmd_server_send_reply(pcmk__client_t *client, uint32_t id, xmlNode *reply) +{ + crm_trace("Sending reply (%d) to client (%s)", id, client->id); + switch (PCMK__CLIENT_TYPE(client)) { + case pcmk__client_ipc: + return pcmk__ipc_send_xml(client, id, reply, FALSE); +#ifdef PCMK__COMPILE_REMOTE + case pcmk__client_tls: + return lrmd__remote_send_xml(client->remote, reply, id, "reply"); +#endif + default: + crm_err("Could not send reply: unknown type for client %s " + CRM_XS " flags=%#llx", + pcmk__client_name(client), client->flags); + } + return ENOTCONN; +} + +// \return Standard Pacemaker return code +int +lrmd_server_send_notify(pcmk__client_t *client, xmlNode *msg) +{ + crm_trace("Sending notification to client (%s)", client->id); + switch (PCMK__CLIENT_TYPE(client)) { + case pcmk__client_ipc: + if (client->ipcs == NULL) { + crm_trace("Could not notify local client: disconnected"); + return ENOTCONN; + } + return pcmk__ipc_send_xml(client, 0, msg, crm_ipc_server_event); +#ifdef PCMK__COMPILE_REMOTE + case pcmk__client_tls: + if (client->remote == NULL) { + crm_trace("Could not notify remote client: disconnected"); + return 
ENOTCONN; + } else { + return lrmd__remote_send_xml(client->remote, msg, 0, "notify"); + } +#endif + default: + crm_err("Could not notify client %s with unknown transport " + CRM_XS " flags=%#llx", + pcmk__client_name(client), client->flags); + } + return ENOTCONN; +} + +/*! + * \internal + * \brief Clean up and exit immediately + * + * \param[in] data Ignored + * + * \return Doesn't return + * \note This can be used as a timer callback. + */ +static gboolean +lrmd_exit(gpointer data) +{ + crm_info("Terminating with %d clients", pcmk__ipc_client_count()); + if (stonith_api) { + stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); + stonith_api->cmds->disconnect(stonith_api); + stonith_api_delete(stonith_api); + } + if (ipcs) { + mainloop_del_ipc_server(ipcs); + } + +#ifdef PCMK__COMPILE_REMOTE + execd_stop_tls_server(); + ipc_proxy_cleanup(); +#endif + + pcmk__client_cleanup(); + g_hash_table_destroy(rsc_list); + + if (mainloop) { + lrmd_drain_alerts(mainloop); + } + + crm_exit(CRM_EX_OK); + return FALSE; +} + +/*! + * \internal + * \brief Request cluster shutdown if appropriate, otherwise exit immediately + * + * \param[in] nsig Signal that caused invocation (ignored) + */ +static void +lrmd_shutdown(int nsig) +{ +#ifdef PCMK__COMPILE_REMOTE + pcmk__client_t *ipc_proxy = ipc_proxy_get_provider(); + + /* If there are active proxied IPC providers, then we may be running + * resources, so notify the cluster that we wish to shut down. + */ + if (ipc_proxy) { + if (shutting_down) { + crm_notice("Waiting for cluster to stop resources before exiting"); + return; + } + + crm_info("Sending shutdown request to cluster"); + if (ipc_proxy_shutdown_req(ipc_proxy) < 0) { + crm_crit("Shutdown request failed, exiting immediately"); + + } else { + /* We requested a shutdown. Now, we need to wait for an + * acknowledgement from the proxy host (which ensures the proxy host + * supports shutdown requests), then wait for all proxy hosts to + * disconnect (which ensures that all resources have been stopped). + */ + shutting_down = TRUE; + + /* Stop accepting new proxy connections */ + execd_stop_tls_server(); + + /* Older controller versions will never acknowledge our request, so + * set a fairly short timeout to exit quickly in that case. If we + * get the ack, we'll defuse this timer. + */ + shutdown_ack_timer = g_timeout_add_seconds(20, lrmd_exit, NULL); + + /* Currently, we let the OS kill us if the clients don't disconnect + * in a reasonable time. We could instead set a long timer here + * (shorter than what the OS is likely to use) and exit immediately + * if it pops. + */ + return; + } + } +#endif + lrmd_exit(NULL); +} + +/*! + * \internal + * \brief Defuse short exit timer if shutting down + */ +void +handle_shutdown_ack(void) +{ +#ifdef PCMK__COMPILE_REMOTE + if (shutting_down) { + crm_info("Received shutdown ack"); + if (shutdown_ack_timer > 0) { + g_source_remove(shutdown_ack_timer); + shutdown_ack_timer = 0; + } + return; + } +#endif + crm_debug("Ignoring unexpected shutdown ack"); +} + +/*! 
+ * \internal + * \brief Make short exit timer fire immediately + */ +void +handle_shutdown_nack(void) +{ +#ifdef PCMK__COMPILE_REMOTE + if (shutting_down) { + crm_info("Received shutdown nack"); + if (shutdown_ack_timer > 0) { + g_source_remove(shutdown_ack_timer); + shutdown_ack_timer = g_timeout_add(0, lrmd_exit, NULL); + } + return; + } +#endif + crm_debug("Ignoring unexpected shutdown nack"); +} + +static GOptionEntry entries[] = { + { "logfile", 'l', G_OPTION_FLAG_NONE, G_OPTION_ARG_FILENAME_ARRAY, + &options.log_files, "Send logs to the additional named logfile", NULL }, + +#ifdef PCMK__COMPILE_REMOTE + + { "port", 'p', G_OPTION_FLAG_NONE, G_OPTION_ARG_STRING, &options.port, + "Port to listen on (defaults to " G_STRINGIFY(DEFAULT_REMOTE_PORT) ")", NULL }, +#endif // PCMK__COMPILE_REMOTE + + { NULL } +}; + +static pcmk__supported_format_t formats[] = { + PCMK__SUPPORTED_FORMAT_NONE, + PCMK__SUPPORTED_FORMAT_TEXT, + PCMK__SUPPORTED_FORMAT_XML, + { NULL, NULL, NULL } +}; + +static GOptionContext * +build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) +{ + GOptionContext *context = NULL; + + context = pcmk__build_arg_context(args, "text (default), xml", group, NULL); + pcmk__add_main_args(context, entries); + return context; +} + +int +main(int argc, char **argv, char **envp) +{ + int rc = pcmk_rc_ok; + crm_exit_t exit_code = CRM_EX_OK; + + const char *option = NULL; + + pcmk__output_t *out = NULL; + + GError *error = NULL; + + GOptionGroup *output_group = NULL; + pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); +#ifdef PCMK__COMPILE_REMOTE + gchar **processed_args = pcmk__cmdline_preproc(argv, "lp"); +#else + gchar **processed_args = pcmk__cmdline_preproc(argv, "l"); +#endif // PCMK__COMPILE_REMOTE + GOptionContext *context = build_arg_context(args, &output_group); + +#ifdef PCMK__COMPILE_REMOTE + // If necessary, create PID 1 now before any file descriptors are opened + remoted_spawn_pidone(argc, argv, envp); +#endif + + crm_log_preinit(EXECD_NAME, argc, argv); + + pcmk__register_formats(output_group, formats); + if (!g_option_context_parse_strv(context, &processed_args, &error)) { + exit_code = CRM_EX_USAGE; + goto done; + } + + rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_ERROR; + g_set_error(&error, PCMK__EXITC_ERROR, exit_code, + "Error creating output format %s: %s", + args->output_ty, pcmk_rc_str(rc)); + goto done; + } + + if (args->version) { + out->version(out, false); + goto done; + } + + // Open additional log files + if (options.log_files != NULL) { + for (gchar **fname = options.log_files; *fname != NULL; fname++) { + rc = pcmk__add_logfile(*fname); + + if (rc != pcmk_rc_ok) { + out->err(out, "Logging to %s is disabled: %s", + *fname, pcmk_rc_str(rc)); + } + } + } + + pcmk__cli_init_logging(EXECD_NAME, args->verbosity); + crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); + + option = pcmk__env_option(PCMK__ENV_LOGFACILITY); + if (!pcmk__str_eq(option, PCMK__VALUE_NONE, + pcmk__str_casei|pcmk__str_null_matches) + && !pcmk__str_eq(option, "/dev/null", pcmk__str_none)) { + setenv("HA_LOGFACILITY", option, 1); /* Used by the ocf_log/ha_log OCF macro */ + } + + option = pcmk__env_option(PCMK__ENV_LOGFILE); + if (!pcmk__str_eq(option, PCMK__VALUE_NONE, + pcmk__str_casei|pcmk__str_null_matches)) { + setenv("HA_LOGFILE", option, 1); /* Used by the ocf_log/ha_log OCF macro */ + + if (pcmk__env_option_enabled(crm_system_name, PCMK__ENV_DEBUG)) { + setenv("HA_DEBUGLOG", 
option, 1); /* Used by the ocf_log/ha_debug OCF macro */ + } + } + +#ifdef PCMK__COMPILE_REMOTE + if (options.port != NULL) { + setenv("PCMK_remote_port", options.port, 1); + } +#endif // PCMK__COMPILE_REMOTE + + start_time = time(NULL); + + crm_notice("Starting Pacemaker " EXECD_TYPE " executor"); + + /* The presence of this variable allegedly controls whether child + * processes like httpd will try and use Systemd's sd_notify + * API + */ + unsetenv("NOTIFY_SOCKET"); + + { + // Temporary directory for resource agent use (leave owned by root) + int rc = pcmk__build_path(CRM_RSCTMP_DIR, 0755); + + if (rc != pcmk_rc_ok) { + crm_warn("Could not create resource agent temporary directory " + CRM_RSCTMP_DIR ": %s", pcmk_rc_str(rc)); + } + } + + rsc_list = pcmk__strkey_table(NULL, free_rsc); + ipcs = mainloop_add_ipc_server(CRM_SYSTEM_LRMD, QB_IPC_SHM, &lrmd_ipc_callbacks); + if (ipcs == NULL) { + crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); + exit_code = CRM_EX_FATAL; + goto done; + } + +#ifdef PCMK__COMPILE_REMOTE + if (lrmd_init_remote_tls_server() < 0) { + crm_err("Failed to create TLS listener: shutting down and staying down"); + exit_code = CRM_EX_FATAL; + goto done; + } + ipc_proxy_init(); +#endif + + mainloop_add_signal(SIGTERM, lrmd_shutdown); + mainloop = g_main_loop_new(NULL, FALSE); + crm_notice("Pacemaker " EXECD_TYPE " executor successfully started and accepting connections"); + crm_notice("OCF resource agent search path is %s", OCF_RA_PATH); + g_main_loop_run(mainloop); + + /* should never get here */ + lrmd_exit(NULL); + +done: + g_strfreev(options.log_files); +#ifdef PCMK__COMPILE_REMOTE + g_free(options.port); +#endif // PCMK__COMPILE_REMOTE + + g_strfreev(processed_args); + pcmk__free_arg_context(context); + + pcmk__output_and_clear_error(&error, out); + + if (out != NULL) { + out->finish(out, exit_code, true, NULL); + pcmk__output_free(out); + } + pcmk__unregister_formats(); + crm_exit(exit_code); +} diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h new file mode 100644 index 0000000..9c1d173 --- /dev/null +++ b/daemons/execd/pacemaker-execd.h @@ -0,0 +1,110 @@ +/* + * Copyright 2012-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#ifndef PACEMAKER_EXECD__H +# define PACEMAKER_EXECD__H + +# include <glib.h> +# include <crm/common/ipc_internal.h> +# include <crm/lrmd.h> +# include <crm/stonith-ng.h> + +# ifdef HAVE_GNUTLS_GNUTLS_H +# include <gnutls/gnutls.h> +# endif + +extern GHashTable *rsc_list; +extern time_t start_time; + +typedef struct lrmd_rsc_s { + char *rsc_id; + char *class; + char *provider; + char *type; + + int call_opts; + + /* NEVER dereference this pointer, + * It simply exists as a switch to let us know + * when the currently active operation has completed */ + void *active; + + /* Operations in this list + * have not been executed yet. */ + GList *pending_ops; + /* Operations in this list are recurring operations + * that have been handed off from the pending ops list. */ + GList *recurring_ops; + + /* If this resource is a fence device, probes are handled internally by the + * executor, and this value indicates the result that should currently be + * returned for probes. 
It should be one of: + * PCMK_EXEC_DONE (to indicate "running"), + * PCMK_EXEC_NO_FENCE_DEVICE ("not running"), or + * PCMK_EXEC_NOT_CONNECTED ("unknown because fencer connection was lost"). + */ + pcmk__action_result_t fence_probe_result; + + crm_trigger_t *work; +} lrmd_rsc_t; + +# ifdef HAVE_GNUTLS_GNUTLS_H +// in remoted_tls.c +int lrmd_init_remote_tls_server(void); +void execd_stop_tls_server(void); +# endif + +int lrmd_server_send_reply(pcmk__client_t *client, uint32_t id, xmlNode *reply); + +int lrmd_server_send_notify(pcmk__client_t *client, xmlNode *msg); + +void notify_of_new_client(pcmk__client_t *new_client); + +void process_lrmd_message(pcmk__client_t *client, uint32_t id, + xmlNode *request); + +void free_rsc(gpointer data); + +void handle_shutdown_ack(void); + +void handle_shutdown_nack(void); + +void lrmd_client_destroy(pcmk__client_t *client); + +void client_disconnect_cleanup(const char *client_id); + +/*! + * \brief Don't worry about freeing this connection. It is + * taken care of after mainloop exits by the main() function. + */ +stonith_t *get_stonith_connection(void); + +/*! + * \brief This is a callback that tells the lrmd + * the current stonith connection has gone away. This allows + * us to timeout any pending stonith commands + */ +void stonith_connection_failed(void); + +#ifdef PCMK__COMPILE_REMOTE +void ipc_proxy_init(void); +void ipc_proxy_cleanup(void); +void ipc_proxy_add_provider(pcmk__client_t *client); +void ipc_proxy_remove_provider(pcmk__client_t *client); +void ipc_proxy_forward_client(pcmk__client_t *client, xmlNode *xml); +pcmk__client_t *ipc_proxy_get_provider(void); +int ipc_proxy_shutdown_req(pcmk__client_t *ipc_proxy); +void remoted_spawn_pidone(int argc, char **argv, char **envp); +#endif + +int process_lrmd_alert_exec(pcmk__client_t *client, uint32_t id, + xmlNode *request); +void lrmd_drain_alerts(GMainLoop *mloop); + +#endif // PACEMAKER_EXECD__H diff --git a/daemons/execd/pacemaker-remoted.8.inc b/daemons/execd/pacemaker-remoted.8.inc new file mode 100644 index 0000000..bc86acc --- /dev/null +++ b/daemons/execd/pacemaker-remoted.8.inc @@ -0,0 +1,5 @@ +[synopsis] +pacemaker-remoted [options] + +/for Pacemaker Remote nodes/ +.SH OPTIONS diff --git a/daemons/execd/pacemaker_remote.in b/daemons/execd/pacemaker_remote.in new file mode 100644 index 0000000..2096c5f --- /dev/null +++ b/daemons/execd/pacemaker_remote.in @@ -0,0 +1,176 @@ +#!@BASH_PATH@ + +# Authors: +# Andrew Beekhof <abeekhof@redhat.com> +# +# License: Revised BSD + +# chkconfig: - 99 01 +# description: Pacemaker Cluster Manager +# processname: pacemaker-remoted +# +### BEGIN INIT INFO +# Provides: pacemaker_remote +# Required-Start: $network $remote_fs +# Should-Start: $syslog +# Required-Stop: $network $remote_fs +# Default-Start: +# Default-Stop: +# Short-Description: Manage the executor for Pacemaker Remote nodes +# Description: Manage the executor for Pacemaker Remote nodes +### END INIT INFO + +desc="Pacemaker Remote Executor" +prog="pacemaker-remoted" + +# set secure PATH +PATH="/sbin:/bin:/usr/sbin:/usr/bin:@sbindir@" + +checkrc() { + if [ $? = 0 ]; then + success + else + failure + fi +} + +success() +{ + echo -ne "[ OK ]\r" +} + +failure() +{ + echo -ne "[FAILED]\r" +} + +status() +{ + pid=$(pidof $1 2>/dev/null) + local rtrn=$? + if [ $rtrn -ne 0 ]; then + echo "$1 is stopped" + if [ -f "@localstatedir@/run/$prog.pid" ]; then + rtrn=1 + else + rtrn=3 + fi + else + echo "$1 (pid $pid) is running..." 
+ fi + return $rtrn +} + +if [ -d @CONFIGDIR@ ]; then + [ -f @INITDIR@/functions ] && . @INITDIR@/functions +set -a + [ -f @CONFIGDIR@/pacemaker ] && . @CONFIGDIR@/pacemaker + [ -f @CONFIGDIR@/sbd ] && . @CONFIGDIR@/sbd +set +a +fi + +LOCK_DIR="." +if [ -d "@localstatedir@/lock/subsys" ]; then + LOCK_DIR="@localstatedir@/lock/subsys" +elif [ -d "@localstatedir@/lock" ]; then + LOCK_DIR="@localstatedir@/lock" +fi +[ -z "$LOCK_FILE" ] && LOCK_FILE="$LOCK_DIR/pacemaker_remote" + +# Check if there is a valid watchdog-device configured in sbd config +if [ x != "x$SBD_WATCHDOG_DEV" -a "/dev/null" != "$SBD_WATCHDOG_DEV" -a -c "$SBD_WATCHDOG_DEV" ]; then + # enhance for unavailable chkconfig - don't touch sbd for now + if chkconfig --list sbd_remote_helper 2>/dev/null | grep -q ":on"; then + SBD_SERVICE=sbd_remote_helper + fi +fi + +start() +{ + echo -n "Starting $desc: " + + # most recent distributions use tmpfs for $@localstatedir@/run + # to avoid to clean it up on every boot. + # they also assume that init scripts will create + # required subdirectories for proper operations + mkdir -p "@localstatedir@/run" + + if status $prog > /dev/null 2>&1; then + success + else + $prog > /dev/null 2>&1 & + + # Time to connect to corosync and fail + sleep 5 + + if status $prog > /dev/null 2>&1; then + touch "$LOCK_FILE" + pidof $prog > "@localstatedir@/run/$prog.pid" + success + else + failure + rtrn=1 + fi + fi + echo + + [ "x$SBD_SERVICE" = "x" ] || service $SBD_SERVICE start +} + +stop() +{ + if status $prog > /dev/null 2>&1; then + echo -n "Signaling $desc to terminate: " + kill -TERM $(pidof $prog) > /dev/null 2>&1 + success + echo + + echo -n "Waiting for $desc to unload:" + while status $prog > /dev/null 2>&1; do + sleep 1 + echo -n "." + done + else + echo -n "$desc is already stopped" + fi + + rm -f "$LOCK_FILE" + rm -f "@localstatedir@/run/$prog.pid" + success + echo + + [ "x$SBD_SERVICE" = "x" ] || service $SBD_SERVICE stop +} + +rtrn=0 + +case "$1" in +start) + start +;; +restart|reload|force-reload) + stop + start +;; +condrestart|try-restart) + if status $prog > /dev/null 2>&1; then + stop + start + rtrn=$? + fi +;; +status) + status $prog + rtrn=$? +;; +stop) + stop + rtrn=$? +;; +*) + echo "usage: $0 {start|stop|restart|reload|force-reload|condrestart|try-restart|status}" + rtrn=2 +;; +esac + +exit $rtrn diff --git a/daemons/execd/pacemaker_remote.service.in b/daemons/execd/pacemaker_remote.service.in new file mode 100644 index 0000000..1e48d14 --- /dev/null +++ b/daemons/execd/pacemaker_remote.service.in @@ -0,0 +1,52 @@ +[Unit] +Description=Pacemaker Remote executor daemon +Documentation=man:pacemaker-remoted +Documentation=https://clusterlabs.org/pacemaker/doc/ + +# See main pacemaker unit file for descriptions of why these are needed +After=network.target +After=time-sync.target +After=dbus.service +Wants=dbus.service +After=resource-agents-deps.target +Wants=resource-agents-deps.target +After=syslog.service +After=rsyslog.service + +[Install] +Alias=pacemaker-remote.service +WantedBy=multi-user.target + +[Service] +Type=simple +KillMode=process +NotifyAccess=none +EnvironmentFile=-@CONFIGDIR@/pacemaker +EnvironmentFile=-@CONFIGDIR@/sbd + +# Not actually success, but fatal failure -- this ensures no respawn +SuccessExitStatus=100 + +ExecStart=@sbindir@/pacemaker-remoted + +# Systemd v227 and above can limit the number of processes spawned by a +# service. That is a bad idea for an HA cluster resource manager, so disable it +# by default. 
The administrator can create a local override if they really want +# a limit. If your systemd version does not support TasksMax, and you want to +# get rid of the resulting log warnings, comment out this option. +TasksMax=infinity + +# If connected to the cluster and when the service functions properly, it will +# wait to exit until the cluster notifies it all resources on the remote node +# have been stopped. The default of 30min should cover most typical cluster +# configurations, but it may need an increase to adapt to local conditions +# (e.g. a large, clustered database could conceivably take longer to stop). +TimeoutStopSec=30min +TimeoutStartSec=30s + +# Restart options include: no, on-success, on-failure, on-abort or always +Restart=on-failure + +# crm_perror() writes directly to stderr, so ignore it here +# to avoid double-logging with the wrong format +StandardError=null diff --git a/daemons/execd/remoted_pidone.c b/daemons/execd/remoted_pidone.c new file mode 100644 index 0000000..4f914eb --- /dev/null +++ b/daemons/execd/remoted_pidone.c @@ -0,0 +1,298 @@ +/* + * Copyright 2017-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <stdio.h> +#include <ctype.h> +#include <stdlib.h> +#include <signal.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include <crm/crm.h> +#include "pacemaker-execd.h" + +static pid_t main_pid = 0; + +static void +sigdone(void) +{ + exit(CRM_EX_OK); +} + +static void +sigreap(void) +{ + pid_t pid = 0; + int status; + + do { + /* + * Opinions seem to differ as to what to put here: + * -1, any child process + * 0, any child process whose process group ID is equal to that of the calling process + */ + pid = waitpid(-1, &status, WNOHANG); + if (pid == main_pid) { + /* Exit when pacemaker-remote exits and use the same return code */ + if (WIFEXITED(status)) { + exit(WEXITSTATUS(status)); + } + exit(CRM_EX_ERROR); + } + } while (pid > 0); +} + +static struct { + int sig; + void (*handler)(void); +} sigmap[] = { + { SIGCHLD, sigreap }, + { SIGINT, sigdone }, +}; + +/*! + * \internal + * \brief Check a line of text for a valid environment variable name + * + * \param[in] line Text to check + * \param[out] first First character of valid name if found, NULL otherwise + * \param[out] last Last character of valid name if found, NULL otherwise + * + * \return TRUE if valid name found, FALSE otherwise + * \note It's reasonable to impose limitations on environment variable names + * beyond what C or setenv() does: We only allow names that contain only + * [a-zA-Z0-9_] characters and do not start with a digit. + */ +static bool +find_env_var_name(char *line, char **first, char **last) +{ + // Skip leading whitespace + *first = line; + while (isspace(**first)) { + ++*first; + } + + if (isalpha(**first) || (**first == '_')) { // Valid first character + *last = *first; + while (isalnum(*(*last + 1)) || (*(*last + 1) == '_')) { + ++*last; + } + return TRUE; + } + + *first = *last = NULL; + return FALSE; +} + +static void +load_env_vars(const char *filename) +{ + /* We haven't forked or initialized logging yet, so don't leave any file + * descriptors open, and don't log -- silently ignore errors. 
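+     * (This is called from remoted_spawn_pidone() before it forks,
+     * possibly while the process is still PID 1 inside a container.)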
+ */ + FILE *fp = fopen(filename, "r"); + + if (fp != NULL) { + char line[LINE_MAX] = { '\0', }; + + while (fgets(line, LINE_MAX, fp) != NULL) { + char *name = NULL; + char *end = NULL; + char *value = NULL; + char *quote = NULL; + + // Look for valid name immediately followed by equals sign + if (find_env_var_name(line, &name, &end) && (*++end == '=')) { + + // Null-terminate name, and advance beyond equals sign + *end++ = '\0'; + + // Check whether value is quoted + if ((*end == '\'') || (*end == '"')) { + quote = end++; + } + value = end; + + if (quote) { + /* Value is remaining characters up to next non-backslashed + * matching quote character. + */ + while (((*end != *quote) || (*(end - 1) == '\\')) + && (*end != '\0')) { + end++; + } + if (*end == *quote) { + // Null-terminate value, and advance beyond close quote + *end++ = '\0'; + } else { + // Matching closing quote wasn't found + value = NULL; + } + + } else { + /* Value is remaining characters up to next non-backslashed + * whitespace. + */ + while ((!isspace(*end) || (*(end - 1) == '\\')) + && (*end != '\0')) { + ++end; + } + + if (end == (line + LINE_MAX - 1)) { + // Line was too long + value = NULL; + } + // Do NOT null-terminate value (yet) + } + + /* We have a valid name and value, and end is now the character + * after the closing quote or the first whitespace after the + * unquoted value. Make sure the rest of the line is just + * whitespace or a comment. + */ + if (value) { + char *value_end = end; + + while (isspace(*end) && (*end != '\n')) { + ++end; + } + if ((*end == '\n') || (*end == '#')) { + if (quote == NULL) { + // Now we can null-terminate an unquoted value + *value_end = '\0'; + } + + // Don't overwrite (bundle options take precedence) + setenv(name, value, 0); + + } else { + value = NULL; + } + } + } + + if ((value == NULL) && (strchr(line, '\n') == NULL)) { + // Eat remainder of line beyond LINE_MAX + if (fscanf(fp, "%*[^\n]\n") == EOF) { + value = NULL; // Don't care, make compiler happy + } + } + } + fclose(fp); + } +} + +void +remoted_spawn_pidone(int argc, char **argv, char **envp) +{ + sigset_t set; + + /* This environment variable exists for two purposes: + * - For testing, setting it to "full" enables full PID 1 behavior even + * when PID is not 1 + * - Setting to "vars" enables just the loading of environment variables + * from /etc/pacemaker/pcmk-init.env, which could be useful for testing or + * containers with a custom PID 1 script that launches pacemaker-remoted. + */ + const char *pid1 = (getpid() == 1)? "full" : getenv("PCMK_remote_pid1"); + + if (pid1 == NULL) { + return; + } + + /* When a container is launched, it may be given specific environment + * variables, which for Pacemaker bundles are given in the bundle + * configuration. However, that does not allow for host-specific values. + * To allow for that, look for a special file containing a shell-like syntax + * of name/value pairs, and export those into the environment. + */ + load_env_vars("/etc/pacemaker/pcmk-init.env"); + + if (strcmp(pid1, "full")) { + return; + } + + /* Containers can be expected to have /var/log, but they may not have + * /var/log/pacemaker, so use a different default if no value has been + * explicitly configured in the container's environment. 
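+     * (A value may already have been set above via
+     * /etc/pacemaker/pcmk-init.env.)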
+ */ + if (pcmk__env_option(PCMK__ENV_LOGFILE) == NULL) { + pcmk__set_env_option(PCMK__ENV_LOGFILE, "/var/log/pcmk-init.log"); + } + + sigfillset(&set); + sigprocmask(SIG_BLOCK, &set, 0); + + main_pid = fork(); + switch (main_pid) { + case 0: + sigprocmask(SIG_UNBLOCK, &set, NULL); + setsid(); + setpgid(0, 0); + + // Child remains as pacemaker-remoted + return; + case -1: + perror("fork"); + } + + /* Parent becomes the reaper of zombie processes */ + /* Safe to initialize logging now if needed */ + +# ifdef HAVE_PROGNAME + /* Differentiate ourselves in the 'ps' output */ + { + char *p; + int i, maxlen; + char *LastArgv = NULL; + const char *name = "pcmk-init"; + + for (i = 0; i < argc; i++) { + if (!i || (LastArgv + 1 == argv[i])) + LastArgv = argv[i] + strlen(argv[i]); + } + + for (i = 0; envp[i] != NULL; i++) { + if ((LastArgv + 1) == envp[i]) { + LastArgv = envp[i] + strlen(envp[i]); + } + } + + maxlen = (LastArgv - argv[0]) - 2; + + i = strlen(name); + + /* We can overwrite individual argv[] arguments */ + snprintf(argv[0], maxlen, "%s", name); + + /* Now zero out everything else */ + p = &argv[0][i]; + while (p < LastArgv) { + *p++ = '\0'; + } + argv[1] = NULL; + } +# endif // HAVE_PROGNAME + + while (1) { + int sig; + size_t i; + + sigwait(&set, &sig); + for (i = 0; i < PCMK__NELEM(sigmap); i++) { + if (sigmap[i].sig == sig) { + sigmap[i].handler(); + break; + } + } + } +} diff --git a/daemons/execd/remoted_proxy.c b/daemons/execd/remoted_proxy.c new file mode 100644 index 0000000..62c8c3a --- /dev/null +++ b/daemons/execd/remoted_proxy.c @@ -0,0 +1,470 @@ +/* + * Copyright 2012-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <glib.h> +#include <unistd.h> + +#include "pacemaker-execd.h" +#include <crm/crm.h> +#include <crm/msg_xml.h> +#include <crm/services.h> +#include <crm/common/mainloop.h> +#include <crm/common/ipc.h> +#include <crm/common/ipc_internal.h> +#include <crm/cib/internal.h> +#include <crm/fencing/internal.h> + +static qb_ipcs_service_t *cib_ro = NULL; +static qb_ipcs_service_t *cib_rw = NULL; +static qb_ipcs_service_t *cib_shm = NULL; + +static qb_ipcs_service_t *attrd_ipcs = NULL; +static qb_ipcs_service_t *crmd_ipcs = NULL; +static qb_ipcs_service_t *stonith_ipcs = NULL; +static qb_ipcs_service_t *pacemakerd_ipcs = NULL; + +// An IPC provider is a cluster node controller connecting as a client +static GList *ipc_providers = NULL; +/* ipc clients == things like cibadmin, crm_resource, connecting locally */ +static GHashTable *ipc_clients = NULL; + +/*! + * \internal + * \brief Get an IPC proxy provider + * + * \return Pointer to a provider if one exists, NULL otherwise + * + * \note Grab the first provider, which is the most recent connection. That way, + * if we haven't yet timed out an old, failed connection, we don't try to + * use it. + */ +pcmk__client_t * +ipc_proxy_get_provider(void) +{ + return ipc_providers? (pcmk__client_t *) (ipc_providers->data) : NULL; +} + +/*! 
+ * \internal + * \brief Accept a client connection on a proxy IPC server + * + * \param[in] c Client's IPC connection + * \param[in] uid Client's user ID + * \param[in] gid Client's group ID + * \param[in] ipc_channel Name of IPC server to proxy + * + * \return pcmk_ok on success, -errno on error + */ +static int32_t +ipc_proxy_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid, const char *ipc_channel) +{ + pcmk__client_t *client; + pcmk__client_t *ipc_proxy = ipc_proxy_get_provider(); + xmlNode *msg; + + if (ipc_proxy == NULL) { + crm_warn("Cannot proxy IPC connection from uid %d gid %d to %s " + "because not connected to cluster", uid, gid, ipc_channel); + return -EREMOTEIO; + } + + /* This new client is a local IPC client on a Pacemaker Remote controlled + * node, needing to access cluster node IPC services. + */ + client = pcmk__new_client(c, uid, gid); + if (client == NULL) { + return -EREMOTEIO; + } + + /* This ipc client is bound to a single ipc provider. If the + * provider goes away, this client is disconnected */ + client->userdata = strdup(ipc_proxy->id); + client->name = crm_strdup_printf("proxy-%s-%d-%.8s", ipc_channel, client->pid, client->id); + + /* Allow remote executor to distinguish between proxied local clients and + * actual executor API clients + */ + pcmk__set_client_flags(client, pcmk__client_to_proxy); + + g_hash_table_insert(ipc_clients, client->id, client); + + msg = create_xml_node(NULL, T_LRMD_IPC_PROXY); + crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_NEW); + crm_xml_add(msg, F_LRMD_IPC_IPC_SERVER, ipc_channel); + crm_xml_add(msg, F_LRMD_IPC_SESSION, client->id); + lrmd_server_send_notify(ipc_proxy, msg); + free_xml(msg); + crm_debug("Accepted IPC proxy connection (session ID %s) " + "from uid %d gid %d on channel %s", + client->id, uid, gid, ipc_channel); + return 0; +} + +static int32_t +crmd_proxy_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) +{ + return ipc_proxy_accept(c, uid, gid, CRM_SYSTEM_CRMD); +} + +static int32_t +attrd_proxy_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) +{ + return ipc_proxy_accept(c, uid, gid, T_ATTRD); +} + +static int32_t +stonith_proxy_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) +{ + return ipc_proxy_accept(c, uid, gid, "stonith-ng"); +} + +static int32_t +pacemakerd_proxy_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) +{ + return -EREMOTEIO; +} + +static int32_t +cib_proxy_accept_rw(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) +{ + return ipc_proxy_accept(c, uid, gid, PCMK__SERVER_BASED_RW); +} + +static int32_t +cib_proxy_accept_ro(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) +{ + return ipc_proxy_accept(c, uid, gid, PCMK__SERVER_BASED_RO); +} + +void +ipc_proxy_forward_client(pcmk__client_t *ipc_proxy, xmlNode *xml) +{ + const char *session = crm_element_value(xml, F_LRMD_IPC_SESSION); + const char *msg_type = crm_element_value(xml, F_LRMD_IPC_OP); + xmlNode *msg = get_message_xml(xml, F_LRMD_IPC_MSG); + pcmk__client_t *ipc_client; + int rc = pcmk_rc_ok; + + /* If the IPC provider is acknowledging our shutdown request, + * defuse the short exit timer to give the cluster time to + * stop any resources we're running. 
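+     *
+     * (We initiate that handshake from ipc_proxy_shutdown_req() below by
+     * sending LRMD_IPC_OP_SHUTDOWN_REQ; the provider replies with the ACK
+     * handled here or the NACK handled just after.)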
+     */
+    if (pcmk__str_eq(msg_type, LRMD_IPC_OP_SHUTDOWN_ACK, pcmk__str_casei)) {
+        handle_shutdown_ack();
+        return;
+    }
+
+    if (pcmk__str_eq(msg_type, LRMD_IPC_OP_SHUTDOWN_NACK, pcmk__str_casei)) {
+        handle_shutdown_nack();
+        return;
+    }
+
+    ipc_client = pcmk__find_client_by_id(session);
+    if (ipc_client == NULL) {
+        xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+
+        crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY);
+        crm_xml_add(msg, F_LRMD_IPC_SESSION, session);
+        lrmd_server_send_notify(ipc_proxy, msg);
+        free_xml(msg);
+        return;
+    }
+
+    /* This is an event or response from the IPC provider, going to the
+     * local IPC client.
+     *
+     * Looking at the chain of events:
+     *
+     * -----remote node----------------|---- cluster node ------
+     * ipc_client <--1--> this code
+     *     <--2--> pacemaker-controld:remote_proxy_cb/remote_proxy_relay_event()
+     *     <--3--> ipc server
+     *
+     * This function is receiving a message from connection 2
+     * and forwarding it to connection 1.
+     */
+
+    if (pcmk__str_eq(msg_type, LRMD_IPC_OP_EVENT, pcmk__str_casei)) {
+        crm_trace("Sending event to %s", ipc_client->id);
+        rc = pcmk__ipc_send_xml(ipc_client, 0, msg, crm_ipc_server_event);
+
+    } else if (pcmk__str_eq(msg_type, LRMD_IPC_OP_RESPONSE, pcmk__str_casei)) {
+        int msg_id = 0;
+
+        crm_element_value_int(xml, F_LRMD_IPC_MSG_ID, &msg_id);
+        crm_trace("Sending response %d to %s", msg_id, ipc_client->id);
+        rc = pcmk__ipc_send_xml(ipc_client, msg_id, msg, FALSE);
+
+        CRM_LOG_ASSERT(msg_id == ipc_client->request_id);
+        ipc_client->request_id = 0;
+
+    } else if (pcmk__str_eq(msg_type, LRMD_IPC_OP_DESTROY, pcmk__str_casei)) {
+        qb_ipcs_disconnect(ipc_client->ipcs);
+
+    } else {
+        crm_err("Unknown IPC proxy message type %s", msg_type);
+    }
+
+    if (rc != pcmk_rc_ok) {
+        crm_warn("Could not proxy IPC to client %s: %s " CRM_XS " rc=%d",
+                 ipc_client->id, pcmk_rc_str(rc), rc);
+    }
+}
+
+static int32_t
+ipc_proxy_dispatch(qb_ipcs_connection_t * c, void *data, size_t size)
+{
+    uint32_t id = 0;
+    uint32_t flags = 0;
+    pcmk__client_t *client = pcmk__find_client(c);
+    pcmk__client_t *ipc_proxy = NULL;
+    xmlNode *request = NULL;
+    xmlNode *msg = NULL;
+
+    // Validate the client before dereferencing its userdata below
+    CRM_CHECK(client != NULL, crm_err("Invalid client"); return FALSE);
+    CRM_CHECK(client->id != NULL, crm_err("Invalid client: %p", client);
+              return FALSE);
+
+    ipc_proxy = pcmk__find_client_by_id(client->userdata);
+    if (ipc_proxy == NULL) {
+        qb_ipcs_disconnect(client->ipcs);
+        return 0;
+    }
+
+    /* This is a request from the local IPC client, going to the IPC
+     * provider.
+     *
+     * Looking at the chain of events:
+     *
+     * -----remote node----------------|---- cluster node ------
+     * ipc_client <--1--> this code
+     *     <--2--> pacemaker-controld:remote_proxy_dispatch_internal()
+     *     <--3--> ipc server
+     *
+     * This function is receiving a request from connection
+     * 1 and forwarding it to connection 2.
+     */
+    request = pcmk__client_data2xml(client, data, &id, &flags);
+    if (request == NULL) {
+        return 0;
+    }
+
+    /* This ensures that synced request/responses happen over the event channel
+     * in the controller, allowing the controller to process the messages async.
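+     *
+     * As an illustration, the envelope built below looks roughly like this
+     * (attribute names are the values of the F_LRMD_IPC_* constants; the
+     * session ID, client, and numbers are hypothetical):
+     *
+     *   <lrmd_ipc_proxy lrmd_ipc_op="request" lrmd_ipc_session="abc-123"
+     *                   lrmd_ipc_client="cibadmin" lrmd_ipc_user="root"
+     *                   lrmd_ipc_msg_id="42" lrmd_ipc_msg_flags="...">
+     *     (original client request, nested via add_message_xml())
+     *   </lrmd_ipc_proxy>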
+ */ + pcmk__set_ipc_flags(flags, pcmk__client_name(client), crm_ipc_proxied); + client->request_id = id; + + msg = create_xml_node(NULL, T_LRMD_IPC_PROXY); + crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_REQUEST); + crm_xml_add(msg, F_LRMD_IPC_SESSION, client->id); + crm_xml_add(msg, F_LRMD_IPC_CLIENT, pcmk__client_name(client)); + crm_xml_add(msg, F_LRMD_IPC_USER, client->user); + crm_xml_add_int(msg, F_LRMD_IPC_MSG_ID, id); + crm_xml_add_int(msg, F_LRMD_IPC_MSG_FLAGS, flags); + add_message_xml(msg, F_LRMD_IPC_MSG, request); + lrmd_server_send_notify(ipc_proxy, msg); + free_xml(request); + free_xml(msg); + + return 0; +} + +/*! + * \internal + * \brief Notify a proxy provider that we wish to shut down + * + * \param[in,out] ipc_proxy IPC client connection to proxy provider + * + * \return 0 on success, -1 on error + */ +int +ipc_proxy_shutdown_req(pcmk__client_t *ipc_proxy) +{ + xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY); + int rc; + + crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_SHUTDOWN_REQ); + + /* We don't really have a session, but the controller needs this attribute + * to recognize this as proxy communication. + */ + crm_xml_add(msg, F_LRMD_IPC_SESSION, "0"); + + rc = (lrmd_server_send_notify(ipc_proxy, msg) != pcmk_rc_ok)? -1 : 0; + free_xml(msg); + return rc; +} + +static int32_t +ipc_proxy_closed(qb_ipcs_connection_t * c) +{ + pcmk__client_t *client = pcmk__find_client(c); + pcmk__client_t *ipc_proxy; + + if (client == NULL) { + return 0; + } + + ipc_proxy = pcmk__find_client_by_id(client->userdata); + + crm_trace("Connection %p", c); + + if (ipc_proxy) { + xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY); + crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY); + crm_xml_add(msg, F_LRMD_IPC_SESSION, client->id); + lrmd_server_send_notify(ipc_proxy, msg); + free_xml(msg); + } + + g_hash_table_remove(ipc_clients, client->id); + + free(client->userdata); + client->userdata = NULL; + pcmk__free_client(client); + return 0; +} + +static void +ipc_proxy_destroy(qb_ipcs_connection_t * c) +{ + crm_trace("Connection %p", c); + ipc_proxy_closed(c); +} + +static struct qb_ipcs_service_handlers crmd_proxy_callbacks = { + .connection_accept = crmd_proxy_accept, + .connection_created = NULL, + .msg_process = ipc_proxy_dispatch, + .connection_closed = ipc_proxy_closed, + .connection_destroyed = ipc_proxy_destroy +}; + +static struct qb_ipcs_service_handlers attrd_proxy_callbacks = { + .connection_accept = attrd_proxy_accept, + .connection_created = NULL, + .msg_process = ipc_proxy_dispatch, + .connection_closed = ipc_proxy_closed, + .connection_destroyed = ipc_proxy_destroy +}; + +static struct qb_ipcs_service_handlers stonith_proxy_callbacks = { + .connection_accept = stonith_proxy_accept, + .connection_created = NULL, + .msg_process = ipc_proxy_dispatch, + .connection_closed = ipc_proxy_closed, + .connection_destroyed = ipc_proxy_destroy +}; + +static struct qb_ipcs_service_handlers pacemakerd_proxy_callbacks = { + .connection_accept = pacemakerd_proxy_accept, + .connection_created = NULL, + .msg_process = NULL, + .connection_closed = NULL, + .connection_destroyed = NULL +}; + +static struct qb_ipcs_service_handlers cib_proxy_callbacks_ro = { + .connection_accept = cib_proxy_accept_ro, + .connection_created = NULL, + .msg_process = ipc_proxy_dispatch, + .connection_closed = ipc_proxy_closed, + .connection_destroyed = ipc_proxy_destroy +}; + +static struct qb_ipcs_service_handlers cib_proxy_callbacks_rw = { + .connection_accept = cib_proxy_accept_rw, + .connection_created = 
NULL, + .msg_process = ipc_proxy_dispatch, + .connection_closed = ipc_proxy_closed, + .connection_destroyed = ipc_proxy_destroy +}; + +void +ipc_proxy_add_provider(pcmk__client_t *ipc_proxy) +{ + // Prepending ensures the most recent connection is always first + ipc_providers = g_list_prepend(ipc_providers, ipc_proxy); +} + +void +ipc_proxy_remove_provider(pcmk__client_t *ipc_proxy) +{ + GHashTableIter iter; + pcmk__client_t *ipc_client = NULL; + char *key = NULL; + GList *remove_these = NULL; + GList *gIter = NULL; + + ipc_providers = g_list_remove(ipc_providers, ipc_proxy); + + g_hash_table_iter_init(&iter, ipc_clients); + while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & ipc_client)) { + const char *proxy_id = ipc_client->userdata; + if (pcmk__str_eq(proxy_id, ipc_proxy->id, pcmk__str_casei)) { + crm_info("ipc proxy connection for client %s pid %d destroyed because cluster node disconnected.", + ipc_client->id, ipc_client->pid); + /* we can't remove during the iteration, so copy items + * to a list we can destroy later */ + remove_these = g_list_append(remove_these, ipc_client); + } + } + + for (gIter = remove_these; gIter != NULL; gIter = gIter->next) { + ipc_client = gIter->data; + + // Disconnection callback will free the client here + qb_ipcs_disconnect(ipc_client->ipcs); + } + + /* just frees the list, not the elements in the list */ + g_list_free(remove_these); +} + +void +ipc_proxy_init(void) +{ + ipc_clients = pcmk__strkey_table(NULL, NULL); + + pcmk__serve_based_ipc(&cib_ro, &cib_rw, &cib_shm, &cib_proxy_callbacks_ro, + &cib_proxy_callbacks_rw); + pcmk__serve_attrd_ipc(&attrd_ipcs, &attrd_proxy_callbacks); + pcmk__serve_fenced_ipc(&stonith_ipcs, &stonith_proxy_callbacks); + pcmk__serve_pacemakerd_ipc(&pacemakerd_ipcs, &pacemakerd_proxy_callbacks); + crmd_ipcs = pcmk__serve_controld_ipc(&crmd_proxy_callbacks); + if (crmd_ipcs == NULL) { + crm_err("Failed to create controller: exiting and inhibiting respawn"); + crm_warn("Verify pacemaker and pacemaker_remote are not both enabled"); + crm_exit(CRM_EX_FATAL); + } +} + +void +ipc_proxy_cleanup(void) +{ + if (ipc_providers) { + g_list_free(ipc_providers); + ipc_providers = NULL; + } + if (ipc_clients) { + g_hash_table_destroy(ipc_clients); + ipc_clients = NULL; + } + pcmk__stop_based_ipc(cib_ro, cib_rw, cib_shm); + qb_ipcs_destroy(attrd_ipcs); + qb_ipcs_destroy(stonith_ipcs); + qb_ipcs_destroy(pacemakerd_ipcs); + qb_ipcs_destroy(crmd_ipcs); + cib_ro = NULL; + cib_rw = NULL; + cib_shm = NULL; +} diff --git a/daemons/execd/remoted_tls.c b/daemons/execd/remoted_tls.c new file mode 100644 index 0000000..c65e3f3 --- /dev/null +++ b/daemons/execd/remoted_tls.c @@ -0,0 +1,428 @@ +/* + * Copyright 2012-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */
+
+#include <crm_internal.h>
+
+#include <glib.h>
+#include <unistd.h>
+
+#include <crm/crm.h>
+#include <crm/msg_xml.h>
+#include <crm/common/mainloop.h>
+#include <crm/common/remote_internal.h>
+#include <crm/lrmd_internal.h>
+
+#include <netdb.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <arpa/inet.h>
+
+#include "pacemaker-execd.h"
+
+#ifdef HAVE_GNUTLS_GNUTLS_H
+
+# include <gnutls/gnutls.h>
+
+# define LRMD_REMOTE_AUTH_TIMEOUT 10000
+gnutls_psk_server_credentials_t psk_cred_s;
+gnutls_dh_params_t dh_params;
+static int ssock = -1;
+extern int lrmd_call_id;
+
+static void
+debug_log(int level, const char *str)
+{
+    fputs(str, stderr);
+}
+
+/*!
+ * \internal
+ * \brief Read (more) TLS handshake data from client
+ *
+ * \param[in,out] client  IPC client doing handshake
+ *
+ * \return 0 on success or more data needed, -1 on error
+ */
+static int
+remoted__read_handshake_data(pcmk__client_t *client)
+{
+    int rc = pcmk__read_handshake_data(client);
+
+    if (rc == EAGAIN) {
+        /* No more data is available at the moment. Just return for now;
+         * we'll get invoked again once the client sends more.
+         */
+        return 0;
+    } else if (rc != pcmk_rc_ok) {
+        return -1;
+    }
+
+    if (client->remote->auth_timeout) {
+        g_source_remove(client->remote->auth_timeout);
+    }
+    client->remote->auth_timeout = 0;
+
+    pcmk__set_client_flags(client, pcmk__client_tls_handshake_complete);
+    crm_notice("Remote client connection accepted");
+
+    /* Only a client with access to the TLS key can connect, so we can treat
+     * it as privileged.
+     */
+    pcmk__set_client_flags(client, pcmk__client_privileged);
+
+    // Alert other clients of the new connection
+    notify_of_new_client(client);
+    return 0;
+}
+
+static int
+lrmd_remote_client_msg(gpointer data)
+{
+    int id = 0;
+    int rc;
+    xmlNode *request = NULL;
+    pcmk__client_t *client = data;
+
+    if (!pcmk_is_set(client->flags,
+                     pcmk__client_tls_handshake_complete)) {
+        return remoted__read_handshake_data(client);
+    }
+
+    switch (pcmk__remote_ready(client->remote, 0)) {
+        case pcmk_rc_ok:
+            break;
+        case ETIME:   // No message available to read
+            return 0;
+        default:      // Error
+            crm_info("Remote client disconnected while polling it");
+            return -1;
+    }
+
+    rc = pcmk__read_remote_message(client->remote, -1);
+
+    request = pcmk__remote_message_xml(client->remote);
+    while (request) {
+        crm_element_value_int(request, F_LRMD_REMOTE_MSG_ID, &id);
+        crm_trace("Processing remote client request %d", id);
+        if (!client->name) {
+            const char *value = crm_element_value(request, F_LRMD_CLIENTNAME);
+
+            if (value) {
+                client->name = strdup(value);
+            }
+        }
+
+        lrmd_call_id++;
+        if (lrmd_call_id < 1) {
+            lrmd_call_id = 1;
+        }
+
+        crm_xml_add(request, F_LRMD_CLIENTID, client->id);
+        crm_xml_add(request, F_LRMD_CLIENTNAME, client->name);
+        crm_xml_add_int(request, F_LRMD_CALLID, lrmd_call_id);
+
+        process_lrmd_message(client, id, request);
+        free_xml(request);
+
+        /* process all the messages in the current buffer */
+        request = pcmk__remote_message_xml(client->remote);
+    }
+
+    if (rc == ENOTCONN) {
+        crm_info("Remote client disconnected while reading from it");
+        return -1;
+    }
+
+    return 0;
+}
+
+static void
+lrmd_remote_client_destroy(gpointer user_data)
+{
+    pcmk__client_t *client = user_data;
+
+    if (client == NULL) {
+        return;
+    }
+
+    crm_notice("Cleaning up after remote client %s disconnected",
+               pcmk__client_name(client));
+
+    ipc_proxy_remove_provider(client);
+
+    /* if this is the last remote
connection, stop recurring + * operations */ + if (pcmk__ipc_client_count() == 1) { + client_disconnect_cleanup(NULL); + } + + if (client->remote->tls_session) { + void *sock_ptr; + int csock; + + sock_ptr = gnutls_transport_get_ptr(*client->remote->tls_session); + csock = GPOINTER_TO_INT(sock_ptr); + + gnutls_bye(*client->remote->tls_session, GNUTLS_SHUT_RDWR); + gnutls_deinit(*client->remote->tls_session); + gnutls_free(client->remote->tls_session); + close(csock); + } + + lrmd_client_destroy(client); + return; +} + +static gboolean +lrmd_auth_timeout_cb(gpointer data) +{ + pcmk__client_t *client = data; + + client->remote->auth_timeout = 0; + + if (pcmk_is_set(client->flags, + pcmk__client_tls_handshake_complete)) { + return FALSE; + } + + mainloop_del_fd(client->remote->source); + client->remote->source = NULL; + crm_err("Remote client authentication timed out"); + + return FALSE; +} + +// Dispatch callback for remote server socket +static int +lrmd_remote_listen(gpointer data) +{ + int csock = -1; + gnutls_session_t *session = NULL; + pcmk__client_t *new_client = NULL; + + // For client socket + static struct mainloop_fd_callbacks lrmd_remote_fd_cb = { + .dispatch = lrmd_remote_client_msg, + .destroy = lrmd_remote_client_destroy, + }; + + CRM_CHECK(ssock >= 0, return TRUE); + + if (pcmk__accept_remote_connection(ssock, &csock) != pcmk_rc_ok) { + return TRUE; + } + + session = pcmk__new_tls_session(csock, GNUTLS_SERVER, GNUTLS_CRD_PSK, + psk_cred_s); + if (session == NULL) { + close(csock); + return TRUE; + } + + new_client = pcmk__new_unauth_client(NULL); + new_client->remote = calloc(1, sizeof(pcmk__remote_t)); + pcmk__set_client_flags(new_client, pcmk__client_tls); + new_client->remote->tls_session = session; + + // Require the client to authenticate within this time + new_client->remote->auth_timeout = g_timeout_add(LRMD_REMOTE_AUTH_TIMEOUT, + lrmd_auth_timeout_cb, + new_client); + crm_info("Remote client pending authentication " + CRM_XS " %p id: %s", new_client, new_client->id); + + new_client->remote->source = + mainloop_add_fd("pacemaker-remote-client", G_PRIORITY_DEFAULT, csock, + new_client, &lrmd_remote_fd_cb); + return TRUE; +} + +static void +tls_server_dropped(gpointer user_data) +{ + crm_notice("TLS server session ended"); + return; +} + +// \return 0 on success, -1 on error (gnutls_psk_server_credentials_function) +static int +lrmd_tls_server_key_cb(gnutls_session_t session, const char *username, gnutls_datum_t * key) +{ + return (lrmd__init_remote_key(key) == pcmk_rc_ok)? 
0 : -1; +} + +static int +bind_and_listen(struct addrinfo *addr) +{ + int optval; + int fd; + int rc; + char buffer[INET6_ADDRSTRLEN] = { 0, }; + + pcmk__sockaddr2str(addr->ai_addr, buffer); + crm_trace("Attempting to bind to address %s", buffer); + + fd = socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol); + if (fd < 0) { + crm_perror(LOG_ERR, "Listener socket creation failed"); + return -1; + } + + /* reuse address */ + optval = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); + if (rc < 0) { + crm_perror(LOG_ERR, "Local address reuse not allowed on %s", buffer); + close(fd); + return -1; + } + + if (addr->ai_family == AF_INET6) { + optval = 0; + rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &optval, sizeof(optval)); + if (rc < 0) { + crm_perror(LOG_INFO, "Couldn't disable IPV6-only on %s", buffer); + close(fd); + return -1; + } + } + + if (bind(fd, addr->ai_addr, addr->ai_addrlen) != 0) { + crm_perror(LOG_ERR, "Cannot bind to %s", buffer); + close(fd); + return -1; + } + + if (listen(fd, 10) == -1) { + crm_perror(LOG_ERR, "Cannot listen on %s", buffer); + close(fd); + return -1; + } + return fd; +} + +static int +get_address_info(const char *bind_name, int port, struct addrinfo **res) +{ + int rc; + char port_str[6]; // at most "65535" + struct addrinfo hints; + + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_flags = AI_PASSIVE; + hints.ai_family = AF_UNSPEC; // IPv6 or IPv4 + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = IPPROTO_TCP; + + snprintf(port_str, sizeof(port_str), "%d", port); + rc = getaddrinfo(bind_name, port_str, &hints, res); + if (rc) { + crm_err("Unable to get IP address(es) for %s: %s", + (bind_name? bind_name : "local node"), gai_strerror(rc)); + return -EADDRNOTAVAIL; + } + return pcmk_ok; +} + +int +lrmd_init_remote_tls_server(void) +{ + int filter; + int port = crm_default_remote_port(); + struct addrinfo *res = NULL, *iter; + gnutls_datum_t psk_key = { NULL, 0 }; + const char *bind_name = getenv("PCMK_remote_address"); + + static struct mainloop_fd_callbacks remote_listen_fd_callbacks = { + .dispatch = lrmd_remote_listen, + .destroy = tls_server_dropped, + }; + + CRM_CHECK(ssock == -1, return ssock); + + crm_debug("Starting TLS listener on %s port %d", + (bind_name? bind_name : "all addresses on"), port); + crm_gnutls_global_init(); + gnutls_global_set_log_function(debug_log); + + if (pcmk__init_tls_dh(&dh_params) != pcmk_rc_ok) { + return -1; + } + gnutls_psk_allocate_server_credentials(&psk_cred_s); + gnutls_psk_set_server_credentials_function(psk_cred_s, lrmd_tls_server_key_cb); + gnutls_psk_set_server_dh_params(psk_cred_s, dh_params); + + /* The key callback won't get called until the first client connection + * attempt. Do it once here, so we can warn the user at start-up if we can't + * read the key. We don't error out, though, because it's fine if the key is + * going to be added later. + */ + if (lrmd__init_remote_key(&psk_key) != pcmk_rc_ok) { + crm_warn("A cluster connection will not be possible until the key is available"); + } + gnutls_free(psk_key.data); + + if (get_address_info(bind_name, port, &res) != pcmk_ok) { + return -1; + } + + /* Currently we listen on only one address from the resulting list (the + * first IPv6 address we can bind to if possible, otherwise the first IPv4 + * address we can bind to). When bind_name is NULL, this should be the + * respective wildcard address. 
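+     *
+     * For example (illustrative): because the hints set AI_PASSIVE with a
+     * NULL node when bind_name is unset, getaddrinfo() returns the wildcard
+     * addresses, so the loop below tries "::" first and falls back to
+     * "0.0.0.0" only if no IPv6 address could be bound.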
+     *
+     * @TODO If there is demand for specifying more than one address, allow
+     * bind_name to be a space-separated list, call getaddrinfo() for each,
+     * and create a socket for each result (set IPV6_V6ONLY on IPv6 sockets
+     * since IPv4 listeners will have their own sockets).
+     */
+    iter = res;
+    filter = AF_INET6;
+    while (iter) {
+        if (iter->ai_family == filter) {
+            ssock = bind_and_listen(iter);
+        }
+        if (ssock != -1) {
+            break;
+        }
+
+        iter = iter->ai_next;
+        if (iter == NULL && filter == AF_INET6) {
+            iter = res;
+            filter = AF_INET;
+        }
+    }
+
+    if (ssock >= 0) {
+        mainloop_add_fd("pacemaker-remote-server", G_PRIORITY_DEFAULT, ssock,
+                        NULL, &remote_listen_fd_callbacks);
+        crm_debug("Started TLS listener on %s port %d",
+                  (bind_name? bind_name : "all addresses on"), port);
+    }
+    freeaddrinfo(res);
+    return ssock;
+}
+
+void
+execd_stop_tls_server(void)
+{
+    if (psk_cred_s) {
+        gnutls_psk_free_server_credentials(psk_cred_s);
+        psk_cred_s = 0;
+    }
+
+    if (ssock >= 0) {
+        close(ssock);
+        ssock = -1;
+    }
+}
+#endif
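
The listener setup in lrmd_init_remote_tls_server() above follows the standard GnuTLS pre-shared-key (PSK) server pattern: allocate server credentials, register a key-lookup callback, and attach the credentials to each GNUTLS_SERVER session. A minimal standalone sketch of that pattern follows; the function names, priority string, and hard-coded key are illustrative placeholders, not Pacemaker code (the real daemon loads the key via lrmd__init_remote_key() and builds sessions with pcmk__new_tls_session()).

/* Minimal sketch of a GnuTLS PSK server session; all names are hypothetical */
#include <string.h>
#include <gnutls/gnutls.h>

static int
demo_psk_lookup(gnutls_session_t session, const char *username,
                gnutls_datum_t *key)
{
    static const unsigned char demo_key[] = "not-a-real-key"; // placeholder

    key->data = gnutls_malloc(sizeof(demo_key));
    if (key->data == NULL) {
        return -1;  // a nonzero return aborts the handshake
    }
    memcpy(key->data, demo_key, sizeof(demo_key));
    key->size = sizeof(demo_key);
    return 0;
}

// Wrap an already-accepted socket in a PSK-authenticated TLS server session
int
demo_tls_server_session(int client_fd, gnutls_session_t *session)
{
    gnutls_psk_server_credentials_t creds = NULL;

    gnutls_global_init();
    gnutls_psk_allocate_server_credentials(&creds);
    gnutls_psk_set_server_credentials_function(creds, demo_psk_lookup);

    gnutls_init(session, GNUTLS_SERVER);
    gnutls_priority_set_direct(*session, "NORMAL:+DHE-PSK:+PSK", NULL);
    gnutls_credentials_set(*session, GNUTLS_CRD_PSK, creds);
    gnutls_transport_set_int(*session, client_fd);

    return gnutls_handshake(*session);  // 0 on success
}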