Adding upstream version 4.2.upstream/4.2

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2023-02-24 14:34:34 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2023-02-24 14:34:34 +0000
commit: 946b54554e13d6a97940df936123855e0a305abc (patch)
tree: 80a778fbd7bb3c7858cfac572df1cb08cfa4f988 /mdmon.c
parent: Initial commit. (diff)
download: mdadm-946b54554e13d6a97940df936123855e0a305abc.tar.xz
mdadm-946b54554e13d6a97940df936123855e0a305abc.zip
1 files changed, 594 insertions, 0 deletions
diff --git a/mdmon.c b/mdmon.c
new file mode 100644
index 0000000..c71e62c
--- /dev/null
+++ b/mdmon.c
@@ -0,0 +1,594 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * md array manager.
+ * When md arrays have user-space managed metadata, this is the program
+ * that does the managing.
+ *
+ * Given one argument: the name of the array (e.g. /dev/md0) that is
+ * the container.
+ * We fork off a helper that runs high priority and mlocked.  It responds to
+ * device failures and other events that might stop writeout, or that are
+ * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container
+ * and starts monitoring them too ... along with a few other tasks.
+ *
+ * The main thread communicates with the priority thread by writing over
+ * a pipe.
+ * Separate programs can communicate with the main thread via Unix-domain
+ * socket.
+ * The two threads share address space and open file table.
+ *
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include	<unistd.h>
+#include	<stdlib.h>
+#include	<sys/types.h>
+#include	<sys/stat.h>
+#include	<sys/socket.h>
+#include	<sys/un.h>
+#include	<sys/mman.h>
+#include	<sys/syscall.h>
+#include	<sys/wait.h>
+#include	<stdio.h>
+#include	<errno.h>
+#include	<string.h>
+#include	<fcntl.h>
+#include	<signal.h>
+#include	<dirent.h>
+#ifdef USE_PTHREADS
+#include	<pthread.h>
+#else
+#include	<sched.h>
+#endif
+
+#include	"mdadm.h"
+#include	"mdmon.h"
+
+char const Name[] = "mdmon";
+
+struct active_array *discard_this;
+struct active_array *pending_discard;
+
+int mon_tid, mgr_tid;
+
+int sigterm;
+
+#ifdef USE_PTHREADS
+static void *run_child(void *v)
+{
+	struct supertype *c = v;
+
+	mon_tid = syscall(SYS_gettid);
+	do_monitor(c);
+	return 0;
+}
+
+static int clone_monitor(struct supertype *container)
+{
+	pthread_attr_t attr;
+	pthread_t thread;
+	int rc;
+
+	mon_tid = -1;
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, 4096);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+	rc = pthread_create(&thread, &attr, run_child, container);
+	if (rc)
+		return rc;
+	while (mon_tid == -1)
+		usleep(10);
+	pthread_attr_destroy(&attr);
+
+	mgr_tid = syscall(SYS_gettid);
+
+	return mon_tid;
+}
+#else /* USE_PTHREADS */
+static int run_child(void *v)
+{
+	struct supertype *c = v;
+
+	do_monitor(c);
+	return 0;
+}
+
+#ifdef __ia64__
+int __clone2(int (*fn)(void *),
+	    void *child_stack_base, size_t stack_size,
+	    int flags, void *arg, ...
+	 /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ );
+#endif
+static int clone_monitor(struct supertype *container)
+{
+	static char stack[4096];
+
+#ifdef __ia64__
+	mon_tid = __clone2(run_child, stack, sizeof(stack),
+		   CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
+		   container);
+#else
+	mon_tid = clone(run_child, stack+4096-64,
+		   CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
+		   container);
+#endif
+
+	mgr_tid = syscall(SYS_gettid);
+
+	return mon_tid;
+}
+#endif /* USE_PTHREADS */
+
+static int make_pidfile(char *devname)
+{
+	char path[100];
+	char pid[10];
+	int fd;
+	int n;
+
+	if (mkdir(MDMON_DIR, 0755) < 0 &&
+	    errno != EEXIST)
+		return -errno;
+	sprintf(path, "%s/%s.pid", MDMON_DIR, devname);
+
+	fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0600);
+	if (fd < 0)
+		return -errno;
+	sprintf(pid, "%d\n", getpid());
+	n = write(fd, pid, strlen(pid));
+	close(fd);
+	if (n < 0)
+		return -errno;
+	return 0;
+}
+
+static void try_kill_monitor(pid_t pid, char *devname, int sock)
+{
+	char buf[100];
+	int fd;
+	int n;
+	long fl;
+	int rv;
+
+	/* first rule of survival... don't off yourself */
+	if (pid == getpid())
+		return;
+
+	/* kill this process if it is mdmon */
+	sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0)
+		return;
+
+	n = read(fd, buf, sizeof(buf)-1);
+	buf[sizeof(buf)-1] = 0;
+	close(fd);
+
+	if (n < 0 || !(strstr(buf, "mdmon") ||
+		       strstr(buf, "@dmon")))
+		return;
+
+	kill(pid, SIGTERM);
+
+	if (sock < 0)
+		return;
+
+	/* Wait for monitor to exit by reading from the socket, after
+	 * clearing the non-blocking flag */
+	fl = fcntl(sock, F_GETFL, 0);
+	fl &= ~O_NONBLOCK;
+	fcntl(sock, F_SETFL, fl);
+	n = read(sock, buf, 100);
+
+	/* If there is I/O going on it might took some time to get to
+	 * clean state. Wait for monitor to exit fully to avoid races.
+	 * Ping it with SIGUSR1 in case that it is sleeping  */
+	for (n = 0; n < 25; n++) {
+		rv = kill(pid, SIGUSR1);
+		if (rv < 0)
+			break;
+		usleep(200000);
+	}
+}
+
+void remove_pidfile(char *devname)
+{
+	char buf[100];
+
+	sprintf(buf, "%s/%s.pid", MDMON_DIR, devname);
+	unlink(buf);
+	sprintf(buf, "%s/%s.sock", MDMON_DIR, devname);
+	unlink(buf);
+}
+
+static int make_control_sock(char *devname)
+{
+	char path[100];
+	int sfd;
+	long fl;
+	struct sockaddr_un addr;
+
+	if (sigterm)
+		return -1;
+
+	sprintf(path, "%s/%s.sock", MDMON_DIR, devname);
+	unlink(path);
+	sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (sfd < 0)
+		return -1;
+
+	addr.sun_family = PF_LOCAL;
+	strcpy(addr.sun_path, path);
+	umask(077); /* ensure no world write access */
+	if (bind(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
+		close(sfd);
+		return -1;
+	}
+	listen(sfd, 10);
+	fl = fcntl(sfd, F_GETFL, 0);
+	fl |= O_NONBLOCK;
+	fcntl(sfd, F_SETFL, fl);
+	return sfd;
+}
+
+static void term(int sig)
+{
+	sigterm = 1;
+}
+
+static void wake_me(int sig)
+{
+
+}
+
+/* if we are debugging and starting mdmon by hand then don't fork */
+static int do_fork(void)
+{
+	#ifdef DEBUG
+	if (check_env("MDADM_NO_MDMON"))
+		return 0;
+	#endif
+
+	return 1;
+}
+
+void usage(void)
+{
+	fprintf(stderr,
+"Usage: mdmon [options] CONTAINER\n"
+"\n"
+"Options are:\n"
+"  --help        -h   : This message\n"
+"  --all         -a   : All devices\n"
+"  --foreground  -F   : Run in foreground (do not fork)\n"
+"  --takeover    -t   : Takeover container\n"
+);
+	exit(2);
+}
+
+static int mdmon(char *devnm, int must_fork, int takeover);
+
+int main(int argc, char *argv[])
+{
+	char *container_name = NULL;
+	char *devnm = NULL;
+	int status = 0;
+	int opt;
+	int all = 0;
+	int takeover = 0;
+	int dofork = 1;
+	static struct option options[] = {
+		{"all", 0, NULL, 'a'},
+		{"takeover", 0, NULL, 't'},
+		{"help", 0, NULL, 'h'},
+		{"offroot", 0, NULL, OffRootOpt},
+		{"foreground", 0, NULL, 'F'},
+		{NULL, 0, NULL, 0}
+	};
+
+	if (in_initrd()) {
+		/*
+		 * set first char of argv[0] to @. This is used by
+		 * systemd to signal that the task was launched from
+		 * initrd/initramfs and should be preserved during shutdown
+		 */
+		argv[0][0] = '@';
+	}
+
+	while ((opt = getopt_long(argc, argv, "thaF", options, NULL)) != -1) {
+		switch (opt) {
+		case 'a':
+			container_name = argv[optind-1];
+			all = 1;
+			break;
+		case 't':
+			takeover = 1;
+			break;
+		case 'F':
+			dofork = 0;
+			break;
+		case OffRootOpt:
+			argv[0][0] = '@';
+			break;
+		case 'h':
+		default:
+			usage();
+			break;
+		}
+	}
+
+	if (all == 0 && container_name == NULL) {
+		if (argv[optind])
+			container_name = argv[optind];
+	}
+
+	if (container_name == NULL)
+		usage();
+
+	if (argc - optind > 1)
+		usage();
+
+	if (strcmp(container_name, "/proc/mdstat") == 0)
+		all = 1;
+
+	if (all) {
+		struct mdstat_ent *mdstat, *e;
+		int container_len = strlen(container_name);
+
+		/* launch an mdmon instance for each container found */
+		mdstat = mdstat_read(0, 0);
+		for (e = mdstat; e; e = e->next) {
+			if (e->metadata_version &&
+			    strncmp(e->metadata_version, "external:", 9) == 0 &&
+			    !is_subarray(&e->metadata_version[9])) {
+				/* update cmdline so this mdmon instance can be
+				 * distinguished from others in a call to ps(1)
+				 */
+				if (strlen(e->devnm) <= (unsigned)container_len) {
+					memset(container_name, 0, container_len);
+					sprintf(container_name, "%s", e->devnm);
+				}
+				status |= mdmon(e->devnm, 1, takeover);
+			}
+		}
+		free_mdstat(mdstat);
+
+		return status;
+	} else if (strncmp(container_name, "md", 2) == 0) {
+		int id = devnm2devid(container_name);
+		if (id)
+			devnm = container_name;
+	} else {
+		struct stat st;
+
+		if (stat(container_name, &st) == 0)
+			devnm = xstrdup(stat2devnm(&st));
+	}
+
+	if (!devnm) {
+		pr_err("%s is not a valid md device name\n",
+			container_name);
+		exit(1);
+	}
+	return mdmon(devnm, dofork && do_fork(), takeover);
+}
+
+static int mdmon(char *devnm, int must_fork, int takeover)
+{
+	int mdfd;
+	struct mdinfo *mdi, *di;
+	struct supertype *container;
+	sigset_t set;
+	struct sigaction act;
+	int pfd[2];
+	int status;
+	int ignore;
+	pid_t victim = -1;
+	int victim_sock = -1;
+
+	dprintf("starting mdmon for %s\n", devnm);
+
+	mdfd = open_dev(devnm);
+	if (mdfd < 0) {
+		pr_err("%s: %s\n", devnm, strerror(errno));
+		return 1;
+	}
+
+	/* Fork, and have the child tell us when they are ready */
+	if (must_fork) {
+		if (pipe(pfd) != 0) {
+			pr_err("failed to create pipe\n");
+			return 1;
+		}
+		switch(fork()) {
+		case -1:
+			pr_err("failed to fork: %s\n", strerror(errno));
+			return 1;
+		case 0: /* child */
+			close(pfd[0]);
+			break;
+		default: /* parent */
+			close(pfd[1]);
+			if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
+				wait(&status);
+				status = WEXITSTATUS(status);
+			}
+			close(pfd[0]);
+			return status;
+		}
+	} else
+		pfd[0] = pfd[1] = -1;
+
+	container = xcalloc(1, sizeof(*container));
+	strcpy(container->devnm, devnm);
+	container->arrays = NULL;
+	container->sock = -1;
+
+	mdi = sysfs_read(mdfd, container->devnm, GET_VERSION|GET_LEVEL|GET_DEVS);
+
+	if (!mdi) {
+		pr_err("failed to load sysfs info for %s\n", container->devnm);
+		exit(3);
+	}
+	if (mdi->array.level != UnSet) {
+		pr_err("%s is not a container - cannot monitor\n", devnm);
+		exit(3);
+	}
+	if (mdi->array.major_version != -1 ||
+	    mdi->array.minor_version != -2) {
+		pr_err("%s does not use external metadata - cannot monitor\n",
+			devnm);
+		exit(3);
+	}
+
+	container->ss = version_to_superswitch(mdi->text_version);
+	if (container->ss == NULL) {
+		pr_err("%s uses unsupported metadata: %s\n",
+			devnm, mdi->text_version);
+		exit(3);
+	}
+
+	container->devs = NULL;
+	for (di = mdi->devs; di; di = di->next) {
+		struct mdinfo *cd = xmalloc(sizeof(*cd));
+		*cd = *di;
+		cd->next = container->devs;
+		container->devs = cd;
+	}
+	sysfs_free(mdi);
+
+	/* SIGUSR is sent between parent and child.  So both block it
+	 * and enable it only with pselect.
+	 */
+	sigemptyset(&set);
+	sigaddset(&set, SIGUSR1);
+	sigaddset(&set, SIGTERM);
+	sigprocmask(SIG_BLOCK, &set, NULL);
+	act.sa_handler = wake_me;
+	act.sa_flags = 0;
+	sigaction(SIGUSR1, &act, NULL);
+	act.sa_handler = term;
+	sigaction(SIGTERM, &act, NULL);
+	act.sa_handler = SIG_IGN;
+	sigaction(SIGPIPE, &act, NULL);
+
+	victim = mdmon_pid(container->devnm);
+	if (victim >= 0)
+		victim_sock = connect_monitor(container->devnm);
+
+	ignore = chdir("/");
+	if (!takeover && victim > 0 && victim_sock >= 0) {
+		if (fping_monitor(victim_sock) == 0) {
+			pr_err("%s already managed\n", container->devnm);
+			exit(3);
+		}
+		close(victim_sock);
+		victim_sock = -1;
+	}
+	if (container->ss->load_container(container, mdfd, devnm)) {
+		pr_err("Cannot load metadata for %s\n", devnm);
+		exit(3);
+	}
+	close(mdfd);
+
+	/* Ok, this is close enough.  We can say goodbye to our parent now.
+	 */
+	if (victim > 0)
+		remove_pidfile(devnm);
+	if (make_pidfile(devnm) < 0) {
+		exit(3);
+	}
+	container->sock = make_control_sock(devnm);
+
+	status = 0;
+	if (pfd[1] >= 0) {
+		if (write(pfd[1], &status, sizeof(status)) < 0)
+			pr_err("failed to notify our parent: %d\n",
+			       getppid());
+		close(pfd[1]);
+	}
+
+	mlockall(MCL_CURRENT | MCL_FUTURE);
+
+	if (clone_monitor(container) < 0) {
+		pr_err("failed to start monitor process: %s\n",
+			strerror(errno));
+		exit(2);
+	}
+
+	if (victim > 0) {
+		try_kill_monitor(victim, container->devnm, victim_sock);
+		if (victim_sock >= 0)
+			close(victim_sock);
+	}
+
+	setsid();
+	manage_fork_fds(0);
+
+	/* This silliness is to stop the compiler complaining
+	 * that we ignore 'ignore'
+	 */
+	if (ignore)
+		ignore++;
+
+	do_manager(container);
+
+	exit(0);
+}
+
+/* Some stub functions so super-* can link with us */
+int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+		  struct supertype *st, unsigned long blocks,
+		  int *fds, unsigned long long *offsets,
+		  int dests, int *destfd, unsigned long long *destoffsets)
+{
+	return 0;
+}
+
+int restore_stripes(int *dest, unsigned long long *offsets,
+		    int raid_disks, int chunk_size, int level, int layout,
+		    int source, unsigned long long read_offset,
+		    unsigned long long start, unsigned long long length,
+		    char *src_buf)
+{
+	return 1;
+}
+
+int save_stripes(int *source, unsigned long long *offsets,
+		 int raid_disks, int chunk_size, int level, int layout,
+		 int nwrites, int *dest,
+		 unsigned long long start, unsigned long long length,
+		 char *buf)
+{
+	return 0;
+}
+
+struct superswitch super0 = {
+	.name = "0.90",
+};
+struct superswitch super1 = {
+	.name = "1.x",
+};
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2023-02-24 14:34:34 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2023-02-24 14:34:34 +0000
commit	946b54554e13d6a97940df936123855e0a305abc (patch)
tree	80a778fbd7bb3c7858cfac572df1cb08cfa4f988 /mdmon.c
parent	Initial commit. (diff)
download	mdadm-946b54554e13d6a97940df936123855e0a305abc.tar.xz mdadm-946b54554e13d6a97940df936123855e0a305abc.zip