diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 07:52:36 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 07:52:36 +0000 |
commit | 7de03e4e519705301265c0415b3c0af85263a7ac (patch) | |
tree | 29d819c5227e3619d18a67d2a5dde963b3229dbe /tools/sfex_daemon.c | |
parent | Initial commit. (diff) | |
download | resource-agents-7de03e4e519705301265c0415b3c0af85263a7ac.tar.xz resource-agents-7de03e4e519705301265c0415b3c0af85263a7ac.zip |
Adding upstream version 1:4.13.0.upstream/1%4.13.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'tools/sfex_daemon.c')
-rw-r--r-- | tools/sfex_daemon.c | 345 |
1 files changed, 345 insertions, 0 deletions
diff --git a/tools/sfex_daemon.c b/tools/sfex_daemon.c new file mode 100644 index 0000000..9a761b8 --- /dev/null +++ b/tools/sfex_daemon.c @@ -0,0 +1,345 @@ +#include <config.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <sys/mman.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <syslog.h> +#include "sfex.h" +#include "sfex_lib.h" + +#if HAVE_GLUE_CONFIG_H +#include <glue_config.h> /* for HA_LOG_FACILITY */ +#endif + +static int sysrq_fd; +static int lock_index = 1; /* default 1st lock */ +static time_t collision_timeout = 1; /* default 1 sec */ +static time_t lock_timeout = 60; /* default 60 sec */ +time_t unlock_timeout = 60; +static time_t monitor_interval = 10; + +static sfex_controldata cdata; +static sfex_lockdata ldata; +static sfex_lockdata ldata_new; + +static const char *device; +const char *progname; +char *nodename; +static const char *rsc_id = "sfex"; + +static void usage(FILE *dist) { + fprintf(dist, "usage: %s [-i <index>] [-c <collision_timeout>] [-t <lock_timeout>] <device>\n", progname); +} + +static void acquire_lock(void) +{ + if (read_lockdata(&cdata, &ldata, lock_index) == -1) { + cl_log(LOG_ERR, "read_lockdata failed in acquire_lock\n"); + exit(EXIT_FAILURE); + } + + if ((ldata.status == SFEX_STATUS_LOCK) && (strncmp(nodename, (const char*)(ldata.nodename), sizeof(ldata.nodename)))) { + unsigned int t = lock_timeout; + while (t > 0) + t = sleep(t); + read_lockdata(&cdata, &ldata_new, lock_index); + if (ldata.count != ldata_new.count) { + cl_log(LOG_ERR, "can\'t acquire lock: the lock's already hold by some other node.\n"); + exit(2); + } + } + + /* The lock acquisition is possible because it was not updated. */ + ldata.status = SFEX_STATUS_LOCK; + ldata.count = SFEX_NEXT_COUNT(ldata.count); + strncpy((char*)(ldata.nodename), nodename, sizeof(ldata.nodename) - 1); + if (write_lockdata(&cdata, &ldata, lock_index) == -1) { + cl_log(LOG_ERR, "write_lockdata failed\n"); + exit(EXIT_FAILURE); + } + + /* detect the collision of lock */ + /* The collision occurs when two or more nodes do the reservation + processing of the lock at the same time. It waits for collision_timeout + seconds to detect this,and whether the superscription of lock data by + another node is done is checked. If the superscription was done by + another node, the lock acquisition with the own node is given up. + */ + { + unsigned int t = collision_timeout; + while (t > 0) + t = sleep(t); + if (read_lockdata(&cdata, &ldata_new, lock_index) == -1) { + cl_log(LOG_ERR, "read_lockdata failed in collision detection\n"); + } + if (strncmp((char*)(ldata.nodename), (const char*)(ldata_new.nodename), sizeof(ldata.nodename))) { + cl_log(LOG_ERR, "can\'t acquire lock: collision detected in the air.\n"); + exit(2); + } + } + + /* extension of lock */ + /* Validly time of the lock is extended. It is because of spending at + the collision_timeout seconds to detect the collision. */ + ldata.count = SFEX_NEXT_COUNT(ldata.count); + if (write_lockdata(&cdata, &ldata, lock_index) == -1) { + cl_log(LOG_ERR, "write_lockdata failed in extension of lock\n"); + exit(EXIT_FAILURE); + } + cl_log(LOG_INFO, "lock acquired\n"); +} + +static void error_todo (void) +{ + if (fork() == 0) { + cl_log(LOG_INFO, "Execute \"crm_resource -F -r %s --node %s\" command\n", rsc_id, nodename); + execl("/usr/sbin/crm_resource", "crm_resource", "-F", "-r", rsc_id, "--node", nodename, NULL); + } else { + exit(EXIT_FAILURE); + } +} + +static void failure_todo(void) +{ +#ifdef SFEX_TESTING + exit(EXIT_FAILURE); +#else + /*execl("/usr/sbin/crm_resource", "crm_resource", "-F", "-r", rsc_id, "--node", nodename, NULL); */ + int ret; + + cl_log(LOG_INFO, "Force reboot node %s\n", nodename); + ret = write(sysrq_fd, "b\n", 2); + if (ret == -1) { + cl_log(LOG_ERR, "%s\n", strerror(errno)); + } + close(sysrq_fd); + exit(EXIT_FAILURE); +#endif +} + +static void update_lock(void) +{ + /* read lock data */ + if (read_lockdata(&cdata, &ldata, lock_index) == -1) { + cl_log(LOG_ERR, "read_lockdata failed in update_lock\n"); + error_todo(); + exit(EXIT_FAILURE); + } + + /* check current lock status */ + /* if own node is not locking, lock update is failed */ + if (ldata.status != SFEX_STATUS_LOCK || strncmp((const char*)(ldata.nodename), nodename, sizeof(ldata.nodename))) { + cl_log(LOG_ERR, "can't update lock.\n"); + failure_todo(); + exit(EXIT_FAILURE); + } + + /* lock update */ + ldata.count = SFEX_NEXT_COUNT(ldata.count); + if (write_lockdata(&cdata, &ldata, lock_index) == -1) { + cl_log(LOG_ERR, "write_lockdata failed in update_lock\n"); + error_todo(); + exit(EXIT_FAILURE); + } +} + +static void release_lock(void) +{ + /* The only thing I care about in release_lock(), is to terminate the process */ + + /* read lock data */ + if (read_lockdata(&cdata, &ldata, lock_index) == -1) { + cl_log(LOG_ERR, "read_lockdata failed in release_lock\n"); + exit(EXIT_FAILURE); + } + + /* check current lock status */ + /* if own node is not locking, we judge that lock has been released already */ + if (ldata.status != SFEX_STATUS_LOCK || strncmp((const char*)(ldata.nodename), nodename, sizeof(ldata.nodename))) { + cl_log(LOG_ERR, "lock was already released.\n"); + exit(EXIT_FAILURE); + } + + /* lock release */ + ldata.status = SFEX_STATUS_UNLOCK; + if (write_lockdata(&cdata, &ldata, lock_index) == -1) { + /*FIXME: We are going to self-stop */ + cl_log(LOG_ERR, "write_lockdata failed in release_lock\n"); + exit(EXIT_FAILURE); + } + cl_log(LOG_INFO, "lock released\n"); +} + +static void quit_handler(int signo, siginfo_t *info, void *context) +{ + cl_log(LOG_INFO, "quit_handler called. now releasing lock\n"); + release_lock(); + cl_log(LOG_INFO, "Shutdown sfex_daemon with EXIT_SUCCESS\n"); + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + + int ret; + + progname = get_progname(argv[0]); + nodename = get_nodename(); + + cl_log_set_entity(progname); + cl_log_set_facility(HA_LOG_FACILITY); + cl_inherit_logging_environment(0); + + /* read command line option */ + opterr = 0; + while (1) { + int c = getopt(argc, argv, "hi:c:t:m:n:r:"); + if (c == -1) + break; + switch (c) { + case 'h': /* help*/ + usage(stdout); + exit(EXIT_SUCCESS); + case 'i': /* -i <index> */ + { + unsigned long l = strtoul(optarg, NULL, 10); + if (l < SFEX_MIN_NUMLOCKS || l > SFEX_MAX_NUMLOCKS) { + cl_log(LOG_ERR, + "index %s is out of range or invalid. it must be integer value between %lu and %lu.\n", + optarg, + (unsigned long)SFEX_MIN_NUMLOCKS, + (unsigned long)SFEX_MAX_NUMLOCKS); + exit(4); + } + lock_index = l; + } + break; + case 'c': /* -c <collision_timeout> */ + { + unsigned long l = strtoul(optarg, NULL, 10); + if (l < 1 || l > INT_MAX) { + cl_log(LOG_ERR, + "collision_timeout %s is out of range or invalid. it must be integer value between %lu and %lu.\n", + optarg, + (unsigned long)1, + (unsigned long)INT_MAX); + exit(4); + } + collision_timeout = l; + } + break; + case 'm': /* -m <monitor_interval> */ + { + unsigned long l = strtoul(optarg, NULL, 10); + if (l < 1 || l > INT_MAX) { + cl_log(LOG_ERR, + "monitor_interval %s is out of range or invalid. it must be integer value between %lu and %lu.\n", + optarg, + (unsigned long)1, + (unsigned long)INT_MAX); + exit(4); + } + monitor_interval = l; + } + break; + case 't': /* -t <lock_timeout> */ + { + unsigned long l = strtoul(optarg, NULL, 10); + if (l < 1 || l > INT_MAX) { + cl_log(LOG_ERR, + "lock_timeout %s is out of range or invalid. it must be integer value between %lu and %lu.\n", + optarg, + (unsigned long)1, + (unsigned long)INT_MAX); + exit(4); + } + lock_timeout = l; + } + break; + case 'n': + { + free(nodename); + if (strlen(optarg) > SFEX_MAX_NODENAME) { + cl_log(LOG_ERR, "nodename %s is too long. must be less than %d byte.\n", + optarg, + (unsigned int)SFEX_MAX_NODENAME); + exit(EXIT_FAILURE); + } + nodename = strdup(optarg); + } + break; + case 'r': + { + rsc_id = strdup(optarg); + } + break; + case '?': /* error */ + usage(stderr); + exit(4); + } + } + /* check parameter except the option */ + if (optind >= argc) { + cl_log(LOG_ERR, "no device specified.\n"); + usage(stderr); + exit(EXIT_FAILURE); + } else if (optind + 1 < argc) { + cl_log(LOG_ERR, "too many arguments.\n"); + usage(stderr); + exit(EXIT_FAILURE); + } + device = argv[optind]; + + prepare_lock(device); +#if !SFEX_TESTING + sysrq_fd = open("/proc/sysrq-trigger", O_WRONLY); + if (sysrq_fd == -1) { + cl_log(LOG_ERR, "failed to open /proc/sysrq-trigger due to %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } +#endif + + ret = lock_index_check(&cdata, lock_index); + if (ret == -1) + exit(EXIT_FAILURE); + + { + struct sigaction sig_act; + sigemptyset (&sig_act.sa_mask); + sig_act.sa_flags = SA_SIGINFO; + + sig_act.sa_sigaction = quit_handler; + ret = sigaction(SIGTERM, &sig_act, NULL); + if (ret == -1) { + cl_log(LOG_ERR, "sigaction failed\n"); + exit(EXIT_FAILURE); + } + } + + cl_log(LOG_INFO, "Starting SFeX Daemon...\n"); + + /* acquire lock first.*/ + acquire_lock(); + + if (daemon(0, 1) != 0) { + cl_perror("%s::%d: daemon() failed.", __FUNCTION__, __LINE__); + release_lock(); + exit(EXIT_FAILURE); + } + + cl_make_realtime(-1, -1, 128, 128); + + cl_log(LOG_INFO, "SFeX Daemon started.\n"); + while (1) { + sleep (monitor_interval); + update_lock(); + } +} |