Adding upstream version 4.3. (upstream/4.3)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 20:55:34 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 20:55:34 +0000
commit: 7f1d6c8fec531fa1762d6d65576aecbee837982c (patch)
tree: b37177c380fa30d0336aad7cac9c72035523206a /Manage.c
parent: Initial commit. (diff)
download: mdadm-7f1d6c8fec531fa1762d6d65576aecbee837982c.tar.xz
mdadm-7f1d6c8fec531fa1762d6d65576aecbee837982c.zip
1 files changed, 1854 insertions, 0 deletions
diff --git a/Manage.c b/Manage.c
new file mode 100644
index 0000000..30302ac
--- /dev/null
+++ b/Manage.c
@@ -0,0 +1,1854 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+#include "udev.h"
+#include <ctype.h>
+
+/*
+ * Manage_ro - switch an array to read-only or read-write.
+ *
+ * devname:  name of the array, used only in messages.
+ * fd:       open file descriptor on the array.
+ * readonly: > 0 requests read-only, < 0 requests read-write.
+ *           For a native array 0 changes nothing; for a subarray with
+ *           external metadata any value <= 0 is handled as read-write.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int Manage_ro(char *devname, int fd, int readonly)
+{
+	/* switch to readonly or rw
+	 *
+	 * requires >= 0.90.0
+	 * first check that array is running
+	 * use RESTART_ARRAY_RW or STOP_ARRAY_RO
+	 *
+	 */
+	struct mdinfo *mdi;
+	int rv = 0;
+
+	/* If this is an externally-managed array, we need to modify the
+	 * metadata_version so that mdmon doesn't undo our change.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
+	if (mdi &&
+	    mdi->array.major_version == -1 &&
+	    is_subarray(mdi->text_version)) {
+		char vers[64];
+
+		/* vers[9] is the first character of text_version; it is
+		 * rewritten below to '-' or '/'.
+		 */
+		snprintf(vers, sizeof(vers), "external:%s", mdi->text_version);
+		if (readonly > 0) {
+			/* We set readonly ourselves. */
+			vers[9] = '-';
+			sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+			/* NOTE(review): the caller's descriptor is closed
+			 * before array_state is written - presumably so it
+			 * does not hold the array open; confirm.
+			 */
+			close(fd);
+			/* The result must land in the function-level 'rv':
+			 * a shadowing local here would make a failure
+			 * return 0.
+			 */
+			rv = sysfs_set_str(mdi, NULL, "array_state", "readonly");
+
+			if (rv < 0) {
+				pr_err("failed to set readonly for %s: %s\n",
+					devname, strerror(errno));
+
+				/* put the original marker back */
+				vers[9] = mdi->text_version[0];
+				sysfs_set_str(mdi, NULL, "metadata_version", vers);
+				rv = 1;
+				goto out;
+			}
+		} else {
+			char *cp;
+			/* We cannot set read/write - must signal mdmon */
+			vers[9] = '/';
+			sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+			/* cut vers+10 at the next '/' before pinging */
+			cp = strchr(vers+10, '/');
+			if (cp)
+				*cp = 0;
+			ping_monitor(vers+10);
+			if (mdi->array.level <= 0)
+				sysfs_set_str(mdi, NULL, "array_state", "active");
+		}
+		goto out;
+	}
+
+	if (!md_array_active(fd)) {
+		pr_err("%s does not appear to be active.\n", devname);
+		rv = 1;
+		goto out;
+	}
+
+	if (readonly > 0) {
+		if (ioctl(fd, STOP_ARRAY_RO, NULL)) {
+			pr_err("failed to set readonly for %s: %s\n",
+				devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+	} else if (readonly < 0) {
+		if (ioctl(fd, RESTART_ARRAY_RW, NULL)) {
+			pr_err("failed to set writable for %s: %s\n",
+				devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+	}
+out:
+	sysfs_free(mdi);
+	return rv;
+}
+
+/*
+ * remove_devices - unlink the names at 'path' that are symlinks to the
+ * standard /dev name of 'devnm'.
+ *
+ * devnm: kernel name of the md device, e.g. "md0".
+ * path:  name that may link to /dev/<devnm>.  NULL or "" means there is
+ *        nothing to do.
+ *
+ * The bare name and partition suffixes 1..15 are examined.  A name is
+ * removed only if readlink() returns exactly the matching /dev name.
+ */
+static void remove_devices(char *devnm, char *path)
+{
+	/*
+	 * Remove names at 'path' - possibly with
+	 * partition suffixes - which link to the 'standard'
+	 * name for devnm.  These were probably created
+	 * by mdadm when the array was assembled.
+	 */
+	char base[40];
+	char *path2;
+	char link[1024];
+	size_t path2_size;
+	int n;
+	int part;
+	char *be;
+	char *pe;
+
+	/* An empty path would make pe[-1] below read before the buffer. */
+	if (!path || !*path)
+		return;
+
+	snprintf(base, sizeof(base), "/dev/%s", devnm);
+	be = base + strlen(base);
+
+	/* 20 spare bytes leave room for the partition suffix */
+	path2_size = strlen(path) + 20;
+	path2 = xmalloc(path2_size);
+	strcpy(path2, path);
+	pe = path2 + strlen(path2);
+
+	for (part = 0; part < 16; part++) {
+		if (part) {
+			snprintf(be, sizeof(base) - (size_t)(be - base),
+				 "p%d", part);
+
+			/* A name ending in a digit takes "pN", any other
+			 * name takes just "N".  <ctype.h> functions need
+			 * an unsigned char value.
+			 */
+			if (isdigit((unsigned char)pe[-1]))
+				snprintf(pe, path2_size - (size_t)(pe - path2),
+					 "p%d", part);
+			else
+				snprintf(pe, path2_size - (size_t)(pe - path2),
+					 "%d", part);
+		}
+		/* readlink() does not NUL-terminate: compare by length */
+		n = readlink(path2, link, sizeof(link));
+		if (n > 0 && (int)strlen(base) == n &&
+		    strncmp(link, base, n) == 0)
+			unlink(path2);
+	}
+	free(path2);
+}
+
+/*
+ * Manage_run - start an array that has already been configured.
+ *
+ * The device name of 'fd' is looked up and handed to IncrementalScan()
+ * together with the context 'c'.  'devname' is used only in the error
+ * message.  Returns 1 if the name cannot be found, otherwise whatever
+ * IncrementalScan() returns.
+ */
+int Manage_run(char *devname, int fd, struct context *c)
+{
+	char nm[32];
+	char *found = fd2devnm(fd);
+
+	if (found == NULL) {
+		pr_err("Cannot find %s in sysfs!!\n", devname);
+		return 1;
+	}
+
+	/* take a private copy before calling on */
+	strcpy(nm, found);
+
+	return IncrementalScan(c, nm);
+}
+
+/* Return 1 if 'fd' refers to the md device named 'devnm', 0 otherwise.
+ * A descriptor whose device name cannot be resolved never matches.
+ */
+static int stop_fd_is_devnm(int fd, const char *devnm)
+{
+	char *nm = fd2devnm(fd);
+
+	return nm && strcmp(nm, devnm) == 0;
+}
+
+/*
+ * Manage_stop - stop an array.
+ *
+ * devname:    name of the array; if it starts with '/' it is also the
+ *             path used to re-open the device.
+ * fd:         open descriptor on the array.  It is closed here and the
+ *             device is re-opened with O_EXCL.
+ * verbose:    messages are printed when >= 0.
+ * will_retry: error messages are not wanted (verbose 0 becomes -1).
+ *
+ * Returns 0 on success and 1 on failure.
+ *
+ * NOTE(review): the re-opened descriptor is not closed before returning;
+ * presumably the caller closes the same descriptor number - confirm.
+ */
+int Manage_stop(char *devname, int fd, int verbose, int will_retry)
+{
+	/* Stop the array.  Array must already be configured
+	 * 'will_retry' means that error messages are not wanted.
+	 */
+	int rv = 0;
+	struct map_ent *map = NULL;
+	struct mdinfo *mdi;
+	char devnm[32];
+	char container[32];
+	char *nm;
+	int err;
+	int count;
+	char buf[SYSFS_MAX_BUF_SIZE];
+	unsigned long long rd1, rd2;
+
+	if (will_retry && verbose == 0)
+		verbose = -1;
+
+	/* fd2devnm() can fail; never hand NULL to strcpy() */
+	nm = fd2devnm(fd);
+	if (!nm) {
+		if (verbose >= 0)
+			pr_err("Cannot find %s in sysfs!!\n", devname);
+		return 1;
+	}
+	strcpy(devnm, nm);
+	/* Get EXCL access first.  If this fails, then attempting
+	 * to stop is probably a bad idea.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
+	if (mdi && is_subarray(mdi->text_version)) {
+		char *sl;
+		/* container name is text_version+1 up to the next '/' */
+		strncpy(container, mdi->text_version+1, sizeof(container));
+		container[sizeof(container)-1] = 0;
+		sl = strchr(container, '/');
+		if (sl)
+			*sl = 0;
+	} else
+		container[0] = 0;
+	close(fd);
+	count = 5;
+	while (((fd = ((devname[0] == '/')
+		       ?open(devname, O_RDONLY|O_EXCL)
+		       :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 ||
+		!stop_fd_is_devnm(fd, devnm)) && container[0] &&
+	       mdmon_running(container) && count) {
+		/* Can't open, so something might be wrong.  However it
+		 * is a container, so we might be racing with mdmon, so
+		 * retry for a bit.
+		 */
+		if (fd >= 0)
+			close(fd);
+		flush_mdmon(container);
+		count--;
+	}
+	if (fd < 0 || !stop_fd_is_devnm(fd, devnm)) {
+		if (fd >= 0)
+			close(fd);
+		if (verbose >= 0)
+			pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
+			       devname);
+		sysfs_free(mdi);
+		return 1;
+	}
+	/* If this is an mdmon managed array, just write 'inactive'
+	 * to the array state and let mdmon clear up.
+	 */
+	if (mdi &&
+	    mdi->array.level > 0 &&
+	    is_subarray(mdi->text_version)) {
+		/* This is mdmon managed. */
+		close(fd);
+
+		/* As we had an O_EXCL open, any use of the device
+		 * which blocks STOP_ARRAY is probably a transient use,
+		 * so it is reasonable to retry for a while - 5 seconds.
+		 */
+		count = 25;
+		while (count &&
+		       (err = sysfs_set_str(mdi, NULL,
+					    "array_state",
+					    "inactive")) < 0 &&
+		       errno == EBUSY) {
+			sleep_for(0, MSEC_TO_NSEC(200), true);
+			count--;
+		}
+		if (err) {
+			if (verbose >= 0)
+				pr_err("failed to stop array %s: %s\n",
+				       devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+
+		/* Give monitor a chance to act */
+		ping_monitor(mdi->text_version);
+
+		fd = open_dev_excl(devnm);
+		if (fd < 0) {
+			if (verbose >= 0)
+				pr_err("failed to completely stop %s: Device is busy\n",
+				       devname);
+			rv = 1;
+			goto out;
+		}
+	} else if (mdi &&
+		   mdi->array.major_version == -1 &&
+		   mdi->array.minor_version == -2 &&
+		   !is_subarray(mdi->text_version)) {
+		struct mdstat_ent *mds, *m;
+		/* container, possibly mdmon-managed.
+		 * Make sure mdmon isn't opening it, which
+		 * would interfere with the 'stop'
+		 */
+		ping_monitor(mdi->sys_name);
+
+		/* now check that there are no existing arrays
+		 * which are members of this array
+		 */
+		mds = mdstat_read(0, 0);
+		for (m = mds; m; m = m->next)
+			if (m->metadata_version &&
+			    strncmp(m->metadata_version, "external:", 9)==0 &&
+			    metadata_container_matches(m->metadata_version+9,
+						       devnm)) {
+				if (verbose >= 0)
+					pr_err("Cannot stop container %s: member %s still active\n",
+					       devname, m->devnm);
+				free_mdstat(mds);
+				rv = 1;
+				goto out;
+			}
+		/* no member found: the list must be released here too */
+		free_mdstat(mds);
+	}
+
+	/* If the array is undergoing a reshape which changes the number
+	 * of devices, then it would be nice to stop it at a point where
+	 * it has completed a full number of stripes in both old and
+	 * new layouts as this will allow the reshape to be reverted.
+	 * So if 'sync_action' is "reshape" and 'raid_disks' shows two
+	 * different numbers, then
+	 *  - freeze reshape
+	 *  - set sync_max to next multiple of both data_disks and
+	 *    chunk sizes (or next but one)
+	 *  - unfreeze reshape
+	 *  - wait on 'sync_completed' for that point to be reached.
+	 */
+	if (mdi && is_level456(mdi->array.level) &&
+	    sysfs_attribute_available(mdi, NULL, "sync_action") &&
+	    sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
+	    sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)) > 0 &&
+	    strcmp(buf, "reshape\n") == 0 &&
+	    sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
+		unsigned long long position, curr;
+		unsigned long long chunk1, chunk2;
+		unsigned long long rddiv, chunkdiv;
+		unsigned long long sectors;
+		unsigned long long sync_max, old_sync_max;
+		unsigned long long completed;
+		int backwards = 0;
+		int delay;
+		int scfd;
+
+		delay = 40;
+		while (rd1 > rd2 && delay > 0 &&
+		       sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
+			/* must be in the critical section - wait a bit */
+			delay -= 1;
+			sleep_for(0, MSEC_TO_NSEC(100), true);
+		}
+
+		if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
+			goto done;
+		/* Array is frozen */
+
+		/* convert device counts to data-device counts */
+		rd1 -= mdi->array.level == 6 ? 2 : 1;
+		rd2 -= mdi->array.level == 6 ? 2 : 1;
+		sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
+		if (strncmp(buf, "back", 4) == 0)
+			backwards = 1;
+		if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
+			/* reshape must have finished now */
+			sysfs_set_str(mdi, NULL, "sync_action", "idle");
+			goto done;
+		}
+		sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
+		/* bytes -> 512-byte sectors */
+		chunk1 /= 512;
+		chunk2 /= 512;
+		rddiv = GCD(rd1, rd2);
+		chunkdiv = GCD(chunk1, chunk2);
+		sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
+
+		if (backwards) {
+			/* Need to subtract 'reshape_position' from
+			 * array size to get equivalent of sync_max.
+			 * Size calculation based on raid5_size in kernel.
+			 */
+			unsigned long long size = mdi->component_size;
+			size &= ~(chunk1-1);
+			size &= ~(chunk2-1);
+			/* rd1 must be smaller */
+			/* Reshape may have progressed further backwards than
+			 * recorded, so target even further back (hence "-1")
+			 */
+			position = (position / sectors - 1) * sectors;
+			/* rd1 is always the conversion factor between 'sync'
+			 * position and 'reshape' position.
+			 * We read 1 "new" stripe worth of data from where-ever,
+			 * and when write out that full stripe.
+			 */
+			sync_max = size - position/rd1;
+		} else {
+			/* Reshape will very likely be beyond position, and it may
+			 * be too late to stop at '+1', so aim for '+2'
+			 */
+			position = (position / sectors + 2) * sectors;
+			sync_max = position/rd1;
+		}
+		if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
+			old_sync_max = mdi->component_size;
+		/* Must not advance sync_max as that could confuse
+		 * the reshape monitor */
+		if (sync_max < old_sync_max)
+			sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+		sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+		/* That should have set things going again.  Now we
+		 * wait a little while (3 second max) for sync_completed
+		 * to reach the target.
+		 * The reshape process can block for 500msec if
+		 * the sync speed limit is hit, so we need to wait
+		 * a lot longer than that. 1 second is usually
+		 * enough.  3 is safe.
+		 */
+		delay = 3000;
+		scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
+		while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+			unsigned long long max_completed;
+			sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
+			sysfs_fd_get_str(scfd, buf, sizeof(buf));
+			if (str_is_none(buf) == true) {
+				/* Either reshape has aborted, or hasn't
+				 * quite started yet.  Wait a bit and
+				 * check  'sync_action' to see.
+				 */
+				sleep_for(0, MSEC_TO_NSEC(10), true);
+				sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
+				if (strncmp(buf, "reshape", 7) != 0)
+					break;
+			}
+
+			if (sysfs_fd_get_two(scfd, &completed,
+					     &max_completed) == 2 &&
+			    /* 'completed' sometimes reads as max-uulong */
+			    completed < max_completed &&
+			    (completed > sync_max ||
+			     (completed == sync_max && curr != position))) {
+				/* the target was passed: move it on by
+				 * whole stripes until it is ahead again
+				 */
+				while (completed > sync_max) {
+					sync_max += sectors / rd1;
+					if (backwards)
+						position -= sectors;
+					else
+						position += sectors;
+				}
+				if (sync_max < old_sync_max)
+					sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+			}
+
+			if (!backwards && curr >= position)
+				break;
+			if (backwards && curr <= position)
+				break;
+			sysfs_wait(scfd, &delay);
+		}
+		if (scfd >= 0)
+			close(scfd);
+
+	}
+done:
+
+	/* As we have an O_EXCL open, any use of the device
+	 * which blocks STOP_ARRAY is probably a transient use,
+	 * so it is reasonable to retry for a while - 5 seconds.
+	 */
+	count = 25; err = 0;
+	while (count && fd >= 0 &&
+	       (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 && errno == EBUSY) {
+		sleep_for(0, MSEC_TO_NSEC(200), true);
+		count --;
+	}
+	if (fd >= 0 && err) {
+		if (verbose >= 0) {
+			pr_err("failed to stop array %s: %s\n",
+			       devname, strerror(errno));
+			if (errno == EBUSY)
+				cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
+		}
+		rv = 1;
+		goto out;
+	}
+
+	if (devnm[0] && udev_is_available()) {
+		struct map_ent *mp = map_by_devnm(&map, devnm);
+		remove_devices(devnm, mp ? mp->path : NULL);
+	}
+
+	if (verbose >= 0)
+		pr_err("stopped %s\n", devname);
+	map_lock(&map);
+	map_remove(&map, devnm);
+	map_unlock(&map);
+out:
+	sysfs_free(mdi);
+
+	return rv;
+}
+
+/*
+ * add_one - insert a new entry into a device list directly after 'dv'.
+ *
+ * The entry gets a copy of 'name' and the disposition 'disp'; all other
+ * fields are zero.  Returns the new entry.
+ */
+static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
+{
+	struct mddev_dev *node = xmalloc(sizeof(*node));
+
+	memset(node, 0, sizeof(*node));
+	node->disposition = disp;
+	node->devname = xstrdup(name);
+
+	/* splice in behind dv */
+	node->next = dv->next;
+	dv->next = node;
+
+	return node;
+}
+
+/*
+ * add_faulty - append one "major:minor" list entry, with disposition
+ * 'disp', after 'dv' for every disk of the array on 'fd' whose state has
+ * bit 0 (faulty) set.
+ *
+ * Nothing is added if the array info cannot be read.  The scan stops once
+ * nr_disks populated slots have been seen or MAX_DISKS slots were tried.
+ */
+static void add_faulty(struct mddev_dev *dv, int fd, char disp)
+{
+	mdu_array_info_t info;
+	mdu_disk_info_t slot;
+	int left;
+	int idx = 0;
+
+	if (md_get_array_info(fd, &info) != 0)
+		return;
+
+	left = info.nr_disks;
+	while (idx < MAX_DISKS && left > 0) {
+		char name[40];
+
+		slot.number = idx++;
+		/* skip slots that cannot be read or hold no device */
+		if (md_get_disk_info(fd, &slot) != 0 ||
+		    (slot.major == 0 && slot.minor == 0))
+			continue;
+
+		left--;
+		if (slot.state & 1) {
+			/* faulty: queue it, chaining behind the last entry */
+			sprintf(name, "%d:%d", slot.major, slot.minor);
+			dv = add_one(dv, name, disp);
+		}
+	}
+}
+
+/*
+ * add_detached - append one "major:minor" list entry, with disposition
+ * 'disp', after 'dv' for every disk of the array on 'fd' that looks
+ * detached.
+ *
+ * A disk counts as detached when opening it fails with ENXIO.  When
+ * 'disp' is 'f', disks whose state already has bit 0 (faulty) set are
+ * left out.  Nothing is added if the array info cannot be read.
+ */
+static void add_detached(struct mddev_dev *dv, int fd, char disp)
+{
+	mdu_array_info_t info;
+	mdu_disk_info_t slot;
+	int left;
+	int idx = 0;
+
+	if (md_get_array_info(fd, &info) != 0)
+		return;
+
+	left = info.nr_disks;
+	while (idx < MAX_DISKS && left > 0) {
+		char name[40];
+		int probe;
+
+		slot.number = idx++;
+		/* skip slots that cannot be read or hold no device */
+		if (md_get_disk_info(fd, &slot) != 0 ||
+		    (slot.major == 0 && slot.minor == 0))
+			continue;
+
+		left--;
+		if (disp == 'f' && (slot.state & 1) != 0)
+			continue;	/* already faulty */
+
+		sprintf(name, "%d:%d", slot.major, slot.minor);
+		probe = dev_open(name, O_RDONLY);
+		if (probe >= 0)
+			/* Not detached */
+			close(probe);
+		else if (errno == ENXIO)
+			/* any other error: probably not detached */
+			dv = add_one(dv, name, disp);
+	}
+}
+
+/*
+ * add_set - append one "major:minor" list entry after 'dv' for every disk
+ * of the level-10 array on 'fd' that belongs to the set named by
+ * 'set_char' ('A' is set 0, 'B' is set 1, ...).
+ *
+ * A disk's set is raid_disk modulo 'copies', where 'copies' is the product
+ * of the two low bytes of the layout (presumably the near and far copy
+ * counts - confirm against the layout definition).  New entries inherit
+ * the disposition of 'dv'.
+ *
+ * Nothing is added if the array info cannot be read, the array is not
+ * level 10, 'copies' is 0, or raid_disks is not a multiple of 'copies'.
+ */
+static void add_set(struct mddev_dev *dv, int fd, char set_char)
+{
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int remaining_disks;
+	int copies, set;
+	int i;
+
+	if (md_get_array_info(fd, &array) != 0)
+		return;
+	if (array.level != 10)
+		return;
+	copies = ((array.layout & 0xff) *
+		  ((array.layout >> 8) & 0xff));
+	/* a zero layout byte would make the '%' below divide by zero */
+	if (copies == 0)
+		return;
+	if (array.raid_disks % copies)
+		return;
+
+	remaining_disks = array.nr_disks;
+	for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
+		char buf[40];
+		disk.number = i;
+		if (md_get_disk_info(fd, &disk) != 0)
+			continue;
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		remaining_disks--;
+		set = disk.raid_disk % copies;
+		if (set_char != set + 'A')
+			continue;
+		sprintf(buf, "%d:%d", disk.major, disk.minor);
+		/* chain behind the last entry added */
+		dv = add_one(dv, buf, dv->disposition);
+	}
+}
+
+/*
+ * attempt_re_add - try to put a device back into the slot recorded in its
+ * own superblock.
+ *
+ * fd:      descriptor on the array.
+ * tfd:     descriptor on the device; used for remove_partitions() and then
+ *          re-opened locally if the superblock has to be rewritten.
+ * dv:      device list entry (name, disposition, writemostly/failfast).
+ * dev_st:  supertype holding the metadata loaded from the device.
+ * tst:     supertype of the array; its UUID is compared when loaded.
+ * rdev:    device number of the device being added.
+ * update:  optional extra superblock update to apply before adding.
+ * devname: array name, passed on to update_super.
+ * verbose: "re-added" is reported when >= 0.
+ * array:   array info; only the clustered state bit is examined.
+ *
+ * Returns 1 if the device was re-added, 0 if a re-add was not attempted or
+ * was refused in a way that lets the caller carry on, and -1 on a hard
+ * failure.
+ */
+int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
+		   struct supertype *dev_st, struct supertype *tst,
+		   unsigned long rdev, enum update_opt update,
+		   char *devname, int verbose, mdu_array_info_t *array)
+{
+	struct mdinfo info;
+	mdu_disk_info_t disc;
+	int duuid[4];
+	int ouuid[4];
+
+	dev_st->ss->getinfo_super(dev_st, &info, NULL);
+	dev_st->ss->uuid_from_super(dev_st, ouuid);
+	if (tst->sb)
+		tst->ss->uuid_from_super(tst, duuid);
+	else
+		/* Assume uuid matches: kernel will check */
+		memcpy(duuid, ouuid, sizeof(ouuid));
+
+	/* Only a device recorded as active, not faulty and carrying the
+	 * array's uuid is worth trying.
+	 */
+	if (!(info.disk.state & (1<<MD_DISK_ACTIVE)))
+		return 0;
+	if (info.disk.state & (1<<MD_DISK_FAULTY))
+		return 0;
+	if (memcmp(duuid, ouuid, sizeof(ouuid)) != 0)
+		return 0;
+
+	/* Need to make sure the kernel will accept it though: the slot
+	 * must be readable and currently empty.
+	 */
+	disc.number = info.disk.number;
+	if (md_get_disk_info(fd, &disc) != 0)
+		return 0;
+	if (disc.major != 0 || disc.minor != 0)
+		return 0;
+
+	disc.major = major(rdev);
+	disc.minor = minor(rdev);
+	disc.number = info.disk.number;
+	disc.raid_disk = info.disk.raid_disk;
+	disc.state = info.disk.state;
+
+	if (array->state & (1 << MD_SB_CLUSTERED)) {
+		/* extra flags are needed when adding to a cluster as
+		 * there are two cases to distinguish
+		 */
+		disc.state |= (dv->disposition == 'c')
+			? (1 << MD_DISK_CANDIDATE)
+			: (1 << MD_DISK_CLUSTER_ADD);
+	}
+
+	if (dv->writemostly == FlagSet)
+		disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+	else if (dv->writemostly == FlagClear)
+		disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
+
+	if (dv->failfast == FlagSet)
+		disc.state |= 1 << MD_DISK_FAILFAST;
+	else if (dv->failfast == FlagClear)
+		disc.state &= ~(1 << MD_DISK_FAILFAST);
+
+	remove_partitions(tfd);
+
+	if (update || dv->writemostly != FlagDefault ||
+	    dv->failfast != FlagDefault) {
+		/* The superblock on the device has to be rewritten.  The
+		 * result of the last update_super call decides whether
+		 * it gets stored.
+		 */
+		int status = -1;
+
+		tfd = dev_open(dv->devname, O_RDWR);
+		if (tfd < 0) {
+			pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
+			return -1;
+		}
+
+		if (dv->writemostly == FlagSet)
+			status = dev_st->ss->update_super(
+				dev_st, NULL, UOPT_SPEC_WRITEMOSTLY,
+				devname, verbose, 0, NULL);
+		else if (dv->writemostly == FlagClear)
+			status = dev_st->ss->update_super(
+				dev_st, NULL, UOPT_SPEC_READWRITE,
+				devname, verbose, 0, NULL);
+
+		if (dv->failfast == FlagSet)
+			status = dev_st->ss->update_super(
+				dev_st, NULL, UOPT_SPEC_FAILFAST,
+				devname, verbose, 0, NULL);
+		else if (dv->failfast == FlagClear)
+			status = dev_st->ss->update_super(
+				dev_st, NULL, UOPT_SPEC_NOFAILFAST,
+				devname, verbose, 0, NULL);
+
+		if (update)
+			status = dev_st->ss->update_super(
+				dev_st, NULL, update,
+				devname, verbose, 0, NULL);
+
+		if (status == 0)
+			status = dev_st->ss->store_super(dev_st, tfd);
+		close(tfd);
+		if (status != 0) {
+			pr_err("failed to update superblock during re-add\n");
+			return -1;
+		}
+	}
+
+	/* clear errno so the check after a failed ioctl is meaningful */
+	errno = 0;
+	if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
+		if (verbose >= 0)
+			pr_err("re-added %s\n", dv->devname);
+		return 1;
+	}
+
+	if (errno == ENOMEM || errno == EROFS) {
+		pr_err("add new device failed for %s: %s\n",
+		       dv->devname, strerror(errno));
+		return dv->disposition == 'M' ? 0 : -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Manage_add - add (or re-add) one device to a running array.
+ *
+ * fd:         descriptor on the array.
+ * tfd:        descriptor on the device being added.
+ * dv:         device list entry; 'disposition' selects the kind of add
+ *             ('a', 'S', 'A', 'M', 'j' and 'c' are examined here).
+ * tst:        supertype of the array; metadata is loaded into it if
+ *             needed.
+ * array:      array info as read from the kernel.
+ * force:      allow a device larger than v0.90 metadata can use.
+ * verbose:    "added" is reported when >= 0.
+ * devname:    array name, used in messages.
+ * update:     passed on to attempt_re_add().
+ * rdev:       device number of the device being added.
+ * array_size: minimum usable size the device must offer.
+ * raid_slot:  disk number to use, or < 0 to pick the first free one.
+ *
+ * Returns 1 if the device was added or re-added, 0 if disposition 'M'
+ * could not be satisfied (the caller may carry on), and -1 on failure.
+ */
+int Manage_add(int fd, int tfd, struct mddev_dev *dv,
+	       struct supertype *tst, mdu_array_info_t *array,
+	       int force, int verbose, char *devname,
+	       enum update_opt update, unsigned long rdev,
+	       unsigned long long array_size, int raid_slot)
+{
+	unsigned long long ldsize;
+	struct supertype *dev_st;
+	int j;
+	mdu_disk_info_t disc;
+	struct map_ent *map = NULL;
+
+	if (!get_dev_size(tfd, dv->devname, &ldsize)) {
+		if (dv->disposition == 'M')
+			return 0;
+		else
+			return -1;
+	}
+
+	if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
+		/* More than 4TB is wasted on v0.90 */
+		if (!force) {
+			pr_err("%s is larger than %s can effectively use.\n"
+			       "       Add --force is you really want to add this device.\n",
+			       dv->devname, devname);
+			return -1;
+		}
+		pr_err("%s is larger than %s can effectively use.\n"
+		       "       Adding anyway as --force was given.\n",
+		       dv->devname, devname);
+	}
+
+	if (array->not_persistent == 0 || tst->ss->external) {
+
+		/* need to find a sample superblock to copy, and
+		 * a spare slot to use.
+		 * For 'external' array (well, container based),
+		 * We can just load the metadata for the array.
+		 */
+		int array_failed;
+		if (tst->sb)
+			/* already loaded */;
+		else if (tst->ss->external) {
+			tst->ss->load_container(tst, fd, NULL);
+		} else for (j = 0; j < tst->max_devs; j++) {
+				char *dev;
+				int dfd;
+				disc.number = j;
+				if (md_get_disk_info(fd, &disc))
+					continue;
+				if (disc.major==0 && disc.minor==0)
+					continue;
+				if ((disc.state & 4)==0) /* sync */
+					continue;
+				/* Looks like a good device to try */
+				dev = map_dev(disc.major, disc.minor, 1);
+				if (!dev)
+					continue;
+				dfd = dev_open(dev, O_RDONLY);
+				if (dfd < 0)
+					continue;
+				if (tst->ss->load_super(tst, dfd,
+							NULL)) {
+					close(dfd);
+					continue;
+				}
+				close(dfd);
+				break;
+			}
+		/* FIXME this is a bad test to be using */
+		if (!tst->sb && (dv->disposition != 'a' &&
+				 dv->disposition != 'S')) {
+			/* we are re-adding a device to a
+			 * completely dead array - have to depend
+			 * on kernel to check
+			 */
+		} else if (!tst->sb) {
+			pr_err("cannot load array metadata from %s\n", devname);
+			return -1;
+		}
+
+		/* Make sure device is large enough */
+		if (dv->disposition != 'j' &&  /* skip size check for Journal */
+		    tst->sb &&
+		    tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
+		    array_size) {
+			if (dv->disposition == 'M')
+				return 0;
+			pr_err("%s not large enough to join array\n",
+			       dv->devname);
+			return -1;
+		}
+
+		/* Possibly this device was recently part of
+		 * the array and was temporarily removed, and
+		 * is now being re-added.  If so, we can
+		 * simply re-add it.
+		 */
+
+		if (array->not_persistent == 0) {
+			dev_st = dup_super(tst);
+			dev_st->ss->load_super(dev_st, tfd, NULL);
+			if (dev_st->sb && dv->disposition != 'S') {
+				int rv;
+
+				rv = attempt_re_add(fd, tfd, dv, dev_st, tst,
+						    rdev, update, devname,
+						    verbose, array);
+				dev_st->ss->free_super(dev_st);
+				if (rv) {
+					free(dev_st);
+					return rv;
+				}
+			}
+			if (dev_st) {
+				dev_st->ss->free_super(dev_st);
+				free(dev_st);
+			}
+		}
+		if (dv->disposition == 'M') {
+			if (verbose > 0)
+				pr_err("--re-add for %s to %s is not possible\n",
+				       dv->devname, devname);
+			return 0;
+		}
+		if (dv->disposition == 'A') {
+			pr_err("--re-add for %s to %s is not possible\n",
+			       dv->devname, devname);
+			return -1;
+		}
+		if (array->active_disks < array->raid_disks) {
+			/* degraded: work out from the in-sync members
+			 * whether the array has actually failed
+			 */
+			char *avail = xcalloc(array->raid_disks, 1);
+			int d;
+			int found = 0;
+
+			for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
+				disc.number = d;
+				if (md_get_disk_info(fd, &disc))
+					continue;
+				if (disc.major == 0 && disc.minor == 0)
+					continue;
+				if (!(disc.state & (1<<MD_DISK_SYNC)))
+					continue;
+				avail[disc.raid_disk] = 1;
+				found++;
+			}
+			array_failed = !enough(array->level, array->raid_disks,
+					       array->layout, 1, avail);
+			free(avail);
+		} else
+			array_failed = 0;
+		if (array_failed) {
+			pr_err("%s has failed so using --add cannot work and might destroy\n",
+			       devname);
+			pr_err("data on %s.  You should stop the array and re-assemble it.\n",
+			       dv->devname);
+			return -1;
+		}
+	} else {
+		/* non-persistent. Must ensure that new drive
+		 * is at least array->size big.
+		 */
+		if (ldsize/512 < array_size) {
+			pr_err("%s not large enough to join array\n",
+			       dv->devname);
+			return -1;
+		}
+	}
+	/* committed to really trying this device now*/
+	remove_partitions(tfd);
+
+	/* in 2.6.17 and earlier, version-1 superblocks won't
+	 * use the number we write, but will choose a free number.
+	 * we must choose the same free number, which requires
+	 * starting at 'raid_disks' and counting up
+	 */
+	for (j = array->raid_disks; j < tst->max_devs; j++) {
+		disc.number = j;
+		if (md_get_disk_info(fd, &disc))
+			break;
+		if (disc.major==0 && disc.minor==0)
+			break;
+		if (disc.state & 8) /* removed */
+			break;
+	}
+	disc.major = major(rdev);
+	disc.minor = minor(rdev);
+	if (raid_slot < 0)
+		disc.number = j;
+	else
+		disc.number = raid_slot;
+	disc.state = 0;
+
+	/* only add journal to array that supports journaling */
+	if (dv->disposition == 'j') {
+		struct mdinfo *mdp;
+
+		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+		if (!mdp) {
+			pr_err("%s unable to read array state.\n", devname);
+			return -1;
+		}
+
+		if (mdp->array_state != ARRAY_READONLY) {
+			sysfs_free(mdp);
+			pr_err("%s is not readonly, cannot add journal.\n", devname);
+			return -1;
+		}
+
+		sysfs_free(mdp);
+
+		disc.raid_disk = 0;
+	}
+
+	if (map_lock(&map))
+		pr_err("failed to get exclusive lock on mapfile when add disk\n");
+
+	if (array->not_persistent==0) {
+		int dfd;
+		if (dv->disposition == 'j')
+			disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
+		if (dv->writemostly == FlagSet)
+			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+		if (dv->failfast == FlagSet)
+			disc.state |= 1 << MD_DISK_FAILFAST;
+		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+		if (tst->ss->add_to_super(tst, &disc, dfd,
+					  dv->devname, INVALID_SECTORS)) {
+			/* Close the descriptor on failure, as the
+			 * external-metadata path below does.
+			 * NOTE(review): assumes add_to_super does not keep
+			 * the descriptor when it fails - confirm.
+			 */
+			if (dfd >= 0)
+				close(dfd);
+			goto unlock;
+		}
+		if (tst->ss->write_init_super(tst))
+			goto unlock;
+	} else if (dv->disposition == 'A') {
+		/*  this had better be raid1.
+		 * As we are "--re-add"ing we must find a spare slot
+		 * to fill.
+		 */
+		char *used = xcalloc(array->raid_disks, 1);
+		for (j = 0; j < tst->max_devs; j++) {
+			mdu_disk_info_t disc2;
+			disc2.number = j;
+			if (md_get_disk_info(fd, &disc2))
+				continue;
+			if (disc2.major==0 && disc2.minor==0)
+				continue;
+			if (disc2.state & 8) /* removed */
+				continue;
+			if (disc2.raid_disk < 0)
+				continue;
+			/* 'used' has raid_disks entries, so raid_disks
+			 * itself is already out of range
+			 */
+			if (disc2.raid_disk >= array->raid_disks)
+				continue;
+			used[disc2.raid_disk] = 1;
+		}
+		for (j = 0 ; j < array->raid_disks; j++)
+			if (!used[j]) {
+				disc.raid_disk = j;
+				disc.state |= (1<<MD_DISK_SYNC);
+				break;
+			}
+		free(used);
+	}
+
+	if (array->state & (1 << MD_SB_CLUSTERED)) {
+		if (dv->disposition == 'c')
+			disc.state |= (1 << MD_DISK_CANDIDATE);
+		else
+			disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+	}
+
+	if (dv->writemostly == FlagSet)
+		disc.state |= (1 << MD_DISK_WRITEMOSTLY);
+	if (dv->failfast == FlagSet)
+		disc.state |= (1 << MD_DISK_FAILFAST);
+	if (tst->ss->external) {
+		/* add a disk
+		 * to an external metadata container */
+		struct mdinfo new_mdi;
+		struct mdinfo *sra;
+		int container_fd;
+		char devnm[32];
+		int dfd;
+
+		strcpy(devnm, fd2devnm(fd));
+
+		container_fd = open_dev_excl(devnm);
+		if (container_fd < 0) {
+			pr_err("add failed for %s: could not get exclusive access to container\n",
+			       dv->devname);
+			tst->ss->free_super(tst);
+			goto unlock;
+		}
+
+		/* Check if metadata handler is able to accept the drive */
+		if (!tst->ss->validate_geometry(tst, LEVEL_CONTAINER, 0, 1, NULL,
+		    0, 0, dv->devname, NULL, 0, 1)) {
+			close(container_fd);
+			goto unlock;
+		}
+
+		Kill(dv->devname, NULL, 0, -1, 0);
+		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+		if (tst->ss->add_to_super(tst, &disc, dfd,
+					  dv->devname, INVALID_SECTORS)) {
+			close(dfd);
+			close(container_fd);
+			goto unlock;
+		}
+		if (!mdmon_running(tst->container_devnm))
+			tst->ss->sync_metadata(tst);
+
+		sra = sysfs_read(container_fd, NULL, 0);
+		if (!sra) {
+			pr_err("add failed for %s: sysfs_read failed\n",
+			       dv->devname);
+			close(container_fd);
+			tst->ss->free_super(tst);
+			goto unlock;
+		}
+		sra->array.level = LEVEL_CONTAINER;
+		/* Need to set data_offset and component_size */
+		tst->ss->getinfo_super(tst, &new_mdi, NULL);
+		new_mdi.disk.major = disc.major;
+		new_mdi.disk.minor = disc.minor;
+		new_mdi.recovery_start = 0;
+		/* Make sure fds are closed as they are O_EXCL which
+		 * would block add_disk */
+		tst->ss->free_super(tst);
+		if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
+			pr_err("add new device to external metadata failed for %s\n", dv->devname);
+			close(container_fd);
+			sysfs_free(sra);
+			goto unlock;
+		}
+		ping_monitor(devnm);
+		sysfs_free(sra);
+		close(container_fd);
+	} else {
+		tst->ss->free_super(tst);
+		if (ioctl(fd, ADD_NEW_DISK, &disc)) {
+			if (dv->disposition == 'j')
+				pr_err("Failed to hot add %s as journal, "
+				       "please try restart %s.\n", dv->devname, devname);
+			else
+				pr_err("add new device failed for %s as %d: %s\n",
+				       dv->devname, j, strerror(errno));
+			goto unlock;
+		}
+		if (dv->disposition == 'j') {
+			pr_err("Journal added successfully, making %s read-write\n", devname);
+			if (Manage_ro(devname, fd, -1))
+				pr_err("Failed to make %s read-write\n", devname);
+		}
+
+	}
+	if (verbose >= 0)
+		pr_err("added %s\n", dv->devname);
+	map_unlock(&map);
+	return 1;
+unlock:
+	map_unlock(&map);
+	return -1;
+}
+
+/*
+ * Manage_remove - hot-remove one device from an array or container.
+ *
+ * tst:     supertype of the array; selects the external-metadata checks.
+ * fd:      descriptor on the array.
+ * dv:      device list entry, used for its name in messages.
+ * sysfd:   if >= 0, a sysfs descriptor used for the removal in place of
+ *          the device number.
+ * rdev:    device number of the device to remove (0 if unknown).
+ * force:   passed on to the hot-remove helpers.
+ * verbose: "hot removed" is reported when >= 0.
+ * devname: array name, used in messages.
+ *
+ * Returns 1 on success and -1 on failure.
+ */
+int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
+		  int sysfd, unsigned long rdev, int force, int verbose, char *devname)
+{
+	int lfd = -1;
+	int err;
+
+	if (tst->ss->external) {
+		/* To remove a device from a container, we must
+		 * check that it isn't in use in an array.
+		 * This involves looking in the 'holders'
+		 * directory - there must be just one entry,
+		 * the container.
+		 * To ensure that it doesn't get used as a
+		 * hot spare while we are checking, we
+		 * get an O_EXCL open on the container
+		 */
+		int ret;
+		char devnm[32];
+		char *nm = fd2devnm(fd);
+
+		/* fd2devnm() can fail; never hand NULL to strcpy() */
+		if (!nm) {
+			pr_err("unable to get container name\n");
+			return -1;
+		}
+		strcpy(devnm, nm);
+		lfd = open_dev_excl(devnm);
+		if (lfd < 0) {
+			pr_err("Cannot get exclusive access  to container - odd\n");
+			return -1;
+		}
+		/* We may not be able to check on holders in
+		 * sysfs, either because we don't have the dev num
+		 * (rdev == 0) or because the device has been detached
+		 * and the 'holders' directory no longer exists
+		 * (ret == -1).  In that case, assume it is OK to
+		 * remove.
+		 */
+		if (rdev == 0)
+			ret = -1;
+		else {
+			/*
+			 * The drive has already been set to 'faulty', however
+			 * monitor might not have had time to process it and the
+			 * drive might still have an entry in the 'holders'
+			 * directory. Try a few times to avoid a false error
+			 */
+			int count = 20;
+
+			do {
+				ret = sysfs_unique_holder(devnm, rdev);
+				if (ret < 2)
+					break;
+				sleep_for(0, MSEC_TO_NSEC(100), true);
+			} while (--count > 0);
+
+			if (ret == 0) {
+				pr_err("%s is not a member, cannot remove.\n",
+					dv->devname);
+				close(lfd);
+				return -1;
+			}
+			if (ret >= 2) {
+				pr_err("%s is still in use, cannot remove.\n",
+					dv->devname);
+				close(lfd);
+				return -1;
+			}
+		}
+	}
+	/* FIXME check that it is a current member */
+	if (sysfd >= 0) {
+		/* device has been removed and we don't know
+		 * the major:minor number
+		 */
+		err = sys_hot_remove_disk(sysfd, force);
+	} else {
+		err = hot_remove_disk(fd, rdev, force);
+		if (err && errno == ENODEV) {
+			/* Old kernels rejected this if no personality
+			 * is registered */
+			struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
+			struct mdinfo *sdv = NULL;
+
+			/* find the sysfs entry with the same device number */
+			if (sra)
+				sdv = sra->devs;
+			for ( ; sdv ; sdv = sdv->next)
+				if (sdv->disk.major == (int)major(rdev) &&
+				    sdv->disk.minor == (int)minor(rdev))
+					break;
+			if (sdv)
+				err = sysfs_set_str(sra, sdv,
+						    "state", "remove");
+			else
+				err = -1;
+			sysfs_free(sra);
+		}
+	}
+	if (err) {
+		pr_err("hot remove failed for %s: %s\n",	dv->devname,
+		       strerror(errno));
+		if (lfd >= 0)
+			close(lfd);
+		return -1;
+	}
+	if (tst->ss->external) {
+		/*
+		 * Before dropping our exclusive open we make an
+		 * attempt at preventing mdmon from seeing an
+		 * 'add' event before reconciling this 'remove'
+		 * event.
+		 */
+		char *devnm = fd2devnm(fd);
+
+		if (!devnm) {
+			pr_err("unable to get container name\n");
+			/* do not leak the exclusive open on this path */
+			if (lfd >= 0)
+				close(lfd);
+			return -1;
+		}
+
+		ping_manager(devnm);
+	}
+	if (lfd >= 0)
+		close(lfd);
+	if (verbose >= 0)
+		pr_err("hot removed %s from %s\n",
+		       dv->devname, devname);
+	return 1;
+}
+
+/**
+ * Manage_replace() - Mark an active member as wanting replacement.
+ * @tst: metadata handler; external metadata is rejected.
+ * @fd: file descriptor of the array.
+ * @dv: list entry for the device to replace.  The list is also searched
+ *      from here on for the first 'W' entry, which is then paired.
+ * @rdev: device number of the member to replace.
+ * @verbose: a confirmation is printed on success when this is >= 0.
+ * @devname: name of the array, used in messages only.
+ *
+ * Writes "want_replacement" to the member's sysfs "state".  If a later
+ * entry with disposition 'W' exists, it is changed to 'w' and its 'used'
+ * field records the raid disk number being replaced.
+ *
+ * Return: 1 on success, -1 on failure.
+ */
+int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
+		   unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	int rv = -1;
+
+	if (tst->ss->external) {
+		pr_err("--replace only supported for native metadata (0.90 or 1.x)\n");
+		return -1;
+	}
+	/* Need to find the device in sysfs and add 'want_replacement' to the
+	 * status.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_DEVS);
+	if (!mdi || !mdi->devs) {
+		pr_err("Cannot find status of %s to enable replacement - strange\n",
+		       devname);
+		goto out;
+	}
+	for (di = mdi->devs; di; di = di->next)
+		if (di->disk.major == (int)major(rdev) &&
+		    di->disk.minor == (int)minor(rdev))
+			break;
+	if (!di) {
+		pr_err("%s not found in %s so cannot --replace it\n",
+		       dv->devname, devname);
+		goto out;
+	}
+	if (di->disk.raid_disk < 0) {
+		pr_err("%s is not active and so cannot be replaced.\n",
+		       dv->devname);
+		goto out;
+	}
+	if (sysfs_set_str(mdi, di, "state", "want_replacement")) {
+		pr_err("Failed to request replacement for %s\n",
+		       dv->devname);
+		goto out;
+	}
+	if (verbose >= 0)
+		pr_err("Marked %s (device %d in %s) for replacement\n",
+		       dv->devname, di->disk.raid_disk, devname);
+	/* If there is a matching 'with', we need to tell it which
+	 * raid disk
+	 */
+	while (dv && dv->disposition != 'W')
+		dv = dv->next;
+	if (dv) {
+		dv->disposition = 'w';
+		dv->used = di->disk.raid_disk;
+	}
+	rv = 1;
+out:
+	/* 'di' points into 'mdi', so release the list only after the last
+	 * use of 'di'.  This is reached with mdi == NULL when the read
+	 * failed.
+	 */
+	sysfs_free(mdi);
+	return rv;
+}
+
+/**
+ * Manage_with() - Make a spare the preferred replacement for a slot.
+ * @tst: metadata handler (not used by this function).
+ * @fd: file descriptor of the array.
+ * @dv: list entry for the replacement device; dv->used holds the slot
+ *      number to request.
+ * @rdev: device number of the replacement device.
+ * @verbose: a confirmation is printed on success when this is >= 0.
+ * @devname: name of the array, used in messages only.
+ *
+ * The device must already be listed in the array's sysfs devices, must
+ * not be faulty and must not already hold a raid slot.
+ *
+ * Return: 1 on success, -1 on failure.
+ */
+int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
+		unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	int rv = -1;
+
+	/* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
+	mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
+	if (!mdi || !mdi->devs) {
+		pr_err("Cannot find status of %s to enable replacement - strange\n",
+		       devname);
+		goto out;
+	}
+	for (di = mdi->devs; di; di = di->next)
+		if (di->disk.major == (int)major(rdev) &&
+		    di->disk.minor == (int)minor(rdev))
+			break;
+	if (!di) {
+		pr_err("%s not found in %s so cannot make it preferred replacement\n",
+		       dv->devname, devname);
+		goto out;
+	}
+	if (di->disk.state & (1<<MD_DISK_FAULTY)) {
+		pr_err("%s is faulty and cannot be a replacement\n",
+		       dv->devname);
+		goto out;
+	}
+	if (di->disk.raid_disk >= 0) {
+		pr_err("%s is active and cannot be a replacement\n",
+		       dv->devname);
+		goto out;
+	}
+	if (sysfs_set_num(mdi, di, "slot", dv->used)) {
+		pr_err("Failed to set %s as preferred replacement.\n",
+		       dv->devname);
+		goto out;
+	}
+	if (verbose >= 0)
+		pr_err("Marked %s in %s as replacement for device %d\n",
+		       dv->devname, devname, dv->used);
+	rv = 1;
+out:
+	/* Single release point for the sysfs list; this is reached with
+	 * mdi == NULL when the read failed.
+	 */
+	sysfs_free(mdi);
+	return rv;
+}
+
+/**
+ * is_remove_safe() - Check if remove is safe.
+ * @array: Array info.
+ * @fd: Array file descriptor.
+ * @devname: Name of device to remove.  A leading "/dev/" is skipped
+ *           before the name is handed to devnm2devid().
+ * @verbose: Verbose.
+ *
+ * The function determines if array will be operational
+ * after removing &devname.  It marks every in-sync member that holds a
+ * raid slot, except &devname, as available and asks enough() whether
+ * that set is sufficient for the array's level and layout.
+ *
+ * Return: True if array will be operational, false otherwise (including
+ *         when the sysfs attributes cannot be read).
+ */
+bool is_remove_safe(mdu_array_info_t *array, const int fd, char *devname, const int verbose)
+{
+	char *devnm = devname;
+	struct mdinfo *mdi, *disk;
+	bool is_enough;
+	dev_t devid;
+	char *avail;
+
+	/* Skip the "/dev/" prefix only when it is really there, so that
+	 * short names (e.g. a bare kernel name) are never indexed past
+	 * their end.
+	 */
+	if (strncmp(devnm, "/dev/", 5) == 0)
+		devnm += 5;
+	devid = devnm2devid(devnm);
+
+	mdi = sysfs_read(fd, NULL, GET_DEVS | GET_DISKS | GET_STATE);
+	if (!mdi) {
+		if (verbose)
+			pr_err("Failed to read sysfs attributes for %s\n", devname);
+		return false;
+	}
+
+	avail = xcalloc(array->raid_disks, sizeof(char));
+
+	/* Walk the members with a separate cursor so that 'mdi' still
+	 * points at the head of the list when it is released below.
+	 */
+	for (disk = mdi->devs; disk; disk = disk->next) {
+		if (disk->disk.raid_disk < 0)
+			continue;
+		/* 'avail' has exactly raid_disks entries */
+		if (disk->disk.raid_disk >= array->raid_disks)
+			continue;
+		if (!(disk->disk.state & (1 << MD_DISK_SYNC)))
+			continue;
+		if (makedev(disk->disk.major, disk->disk.minor) == devid)
+			continue;
+		avail[disk->disk.raid_disk] = 1;
+	}
+	sysfs_free(mdi);
+
+	is_enough = enough(array->level, array->raid_disks,
+			   array->layout, 1, avail);
+
+	free(avail);
+	return is_enough;
+}
+
+/**
+ * Manage_subdevs() - Execute operation depending on devmode.
+ *
+ * @devname: name of the device.
+ * @fd: file descriptor.
+ * @devlist: list of sub-devices to manage.
+ * @verbose: verbose level.
+ * @test: test flag.
+ * @update: type of update.
+ * @force: force flag.
+ *
+ * This function executes operation defined by devmode
+ * for each dev from devlist.
+ * Devmode can be:
+ * 'a' - add the device
+ * 'S' - add the device as a spare - don't try re-add
+ * 'j' - add the device as a journal device
+ * 'A' - re-add the device
+ * 'r' - remove the device: HOT_REMOVE_DISK
+ *       device can be 'faulty' or 'detached' in which case all
+ *       matching devices are removed.
+ * 'f' - set the device faulty SET_DISK_FAULTY
+ *       device can be 'detached' in which case any device that
+ *       is inaccessible will be marked faulty.
+ * 'I' - remove device by using incremental fail
+ *       which is executed when device is removed surprisingly.
+ * 'R' - mark this device as wanting replacement.
+ * 'W' - this device is added if necessary and activated as
+ *       a replacement for a previous 'R' device.
+ * -----
+ * 'w' - 'W' will be changed to 'w' when it is paired with
+ *       a 'R' device.  If a 'W' is found while walking the list
+ *       it must be unpaired, and is an error.
+ * 'M' - this is created by a 'missing' target.  It is a slight
+ *       variant on 'A'
+ * 'F' - Another variant of 'A', where the device was faulty
+ *       so must be removed from the array first.
+ * 'c' - confirm the device as found (for clustered environments)
+ *
+ * For 'f' and 'r', the device can also be a kernel-internal
+ * name such as 'sdb'.
+ *
+ * Return: 0 on success, or 2 on success when @test is set and no action
+ *         was counted.  On failure 1, or 2 when @test is not set and
+ *         setting a device faulty failed with EBUSY.
+ */
+int Manage_subdevs(char *devname, int fd,
+		   struct mddev_dev *devlist, int verbose, int test,
+		   enum update_opt update, int force)
+{
+	mdu_array_info_t array;
+	unsigned long long array_size;
+	struct mddev_dev *dv;
+	int tfd = -1;
+	struct supertype *tst = NULL;
+	char *subarray = NULL;
+	int sysfd = -1;
+	int count = 0; /* number of actions taken */
+	struct mdinfo info;
+	struct mdinfo devinfo;
+	int frozen = 0;
+	int busy = 0;
+	int raid_slot = -1;
+
+	if (sysfs_init(&info, fd, NULL)) {
+		pr_err("sysfs not availabile for %s\n", devname);
+		goto abort;
+	}
+
+	if (md_get_array_info(fd, &array)) {
+		pr_err("Cannot get array info for %s\n", devname);
+		goto abort;
+	}
+	/* array.size is only 32 bits and may be truncated.
+	 * So read from sysfs if possible, and record number of sectors
+	 */
+
+	array_size = get_component_size(fd);
+	if (array_size <= 0)
+		array_size = array.size * 2;
+
+	tst = super_by_fd(fd, &subarray);
+	if (!tst) {
+		pr_err("unsupport array - version %d.%d\n",
+			array.major_version, array.minor_version);
+		goto abort;
+	}
+
+	for (dv = devlist; dv; dv = dv->next) {
+		dev_t rdev = 0; /* device to add/remove etc */
+		int rv;
+		int mj,mn;
+
+		raid_slot = -1;
+		if (dv->disposition == 'c') {
+			rv = parse_cluster_confirm_arg(dv->devname,
+						       &dv->devname,
+						       &raid_slot);
+			if (rv) {
+				pr_err("Could not get the devname of cluster\n");
+				goto abort;
+			}
+		}
+
+		if (strcmp(dv->devname, "failed") == 0 ||
+		    strcmp(dv->devname, "faulty") == 0) {
+			if (dv->disposition != 'A' && dv->disposition != 'r') {
+				pr_err("%s only meaningful with -r or --re-add, not -%c\n",
+					dv->devname, dv->disposition);
+				goto abort;
+			}
+			add_faulty(dv, fd, (dv->disposition == 'A'
+					    ? 'F' : 'r'));
+			continue;
+		}
+		if (strcmp(dv->devname, "detached") == 0) {
+			if (dv->disposition != 'r' && dv->disposition != 'f') {
+				pr_err("%s only meaningful with -r of -f, not -%c\n",
+					dv->devname, dv->disposition);
+				goto abort;
+			}
+			add_detached(dv, fd, dv->disposition);
+			continue;
+		}
+
+		if (strcmp(dv->devname, "missing") == 0) {
+			struct mddev_dev *add_devlist;
+			struct mddev_dev **dp;
+			if (dv->disposition == 'c') {
+				rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+				break;
+			}
+
+			if (dv->disposition != 'A') {
+				pr_err("'missing' only meaningful with --re-add\n");
+				goto abort;
+			}
+			add_devlist = conf_get_devs();
+			if (add_devlist == NULL) {
+				pr_err("no devices to scan for missing members.\n");
+				continue;
+			}
+			for (dp = &add_devlist; *dp; dp = & (*dp)->next)
+				/* 'M' (for 'missing') is like 'A' without errors */
+				(*dp)->disposition = 'M';
+			/* Splice the configured devices in right after 'dv' */
+			*dp = dv->next;
+			dv->next = add_devlist;
+			continue;
+		}
+
+		if (strncmp(dv->devname, "set-", 4) == 0 &&
+		    strlen(dv->devname) == 5) {
+			int copies;
+
+			if (dv->disposition != 'r' &&
+			    dv->disposition != 'f') {
+				pr_err("'%s' only meaningful with -r or -f\n",
+				       dv->devname);
+				goto abort;
+			}
+			if (array.level != 10) {
+				pr_err("'%s' only meaningful with RAID10 arrays\n",
+				       dv->devname);
+				goto abort;
+			}
+			copies = ((array.layout & 0xff) *
+				  ((array.layout >> 8) & 0xff));
+			if (array.raid_disks % copies != 0 ||
+			    dv->devname[4] < 'A' ||
+			    dv->devname[4] >= 'A' + copies ||
+			    copies > 26) {
+				pr_err("'%s' not meaningful with this array\n",
+				       dv->devname);
+				goto abort;
+			}
+			add_set(dv, fd, dv->devname[4]);
+			continue;
+		}
+
+		if (strchr(dv->devname, '/') == NULL &&
+		    strchr(dv->devname, ':') == NULL &&
+		    strlen(dv->devname) < 50) {
+			/* Assume this is a kernel-internal name like 'sda1' */
+			int found = 0;
+			char dname[55];
+			if (dv->disposition != 'r' && dv->disposition != 'f' &&
+			    dv->disposition != 'I') {
+				pr_err("%s only meaningful with -r, -f or -I, not -%c\n",
+					dv->devname, dv->disposition);
+				goto abort;
+			}
+
+			sprintf(dname, "dev-%s", dv->devname);
+			sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
+			if (sysfd >= 0) {
+				char dn[SYSFS_MAX_BUF_SIZE];
+				if (sysfs_fd_get_str(sysfd, dn, sizeof(dn)) > 0 &&
+				    sscanf(dn, "%d:%d", &mj,&mn) == 2) {
+					rdev = makedev(mj,mn);
+					found = 1;
+				}
+				close(sysfd);
+				sysfd = -1;
+			}
+			if (!found) {
+				sysfd = sysfs_open(fd2devnm(fd), dname, "state");
+				if (sysfd < 0) {
+					pr_err("%s does not appear to be a component of %s\n",
+						dv->devname, devname);
+					goto abort;
+				}
+			}
+		} else if ((dv->disposition == 'r' ||
+			    dv->disposition == 'f') &&
+			   get_maj_min(dv->devname, &mj, &mn)) {
+			/* for 'fail' and 'remove', the device might
+			 * not exist.
+			 */
+			rdev = makedev(mj, mn);
+		} else {
+			tfd = dev_open(dv->devname, O_RDONLY);
+			if (tfd >= 0) {
+				fstat_is_blkdev(tfd, dv->devname, &rdev);
+				close(tfd);
+			} else {
+				int open_err = errno;
+				if (!stat_is_blkdev(dv->devname, &rdev)) {
+					if (dv->disposition == 'M')
+						/* non-fatal. Also improbable */
+						continue;
+					goto abort;
+				}
+				if (dv->disposition == 'r')
+					/* Be happy, the stat worked, that is
+					 * enough for --remove
+					 */
+					;
+				else {
+					if (dv->disposition == 'M')
+						/* non-fatal */
+						continue;
+					pr_err("Cannot open %s: %s\n",
+					       dv->devname, strerror(open_err));
+					goto abort;
+				}
+			}
+		}
+		switch(dv->disposition){
+		default:
+			pr_err("internal error - devmode[%s]=%d\n",
+				dv->devname, dv->disposition);
+			goto abort;
+		case 'a':
+		case 'S': /* --add-spare */
+		case 'j': /* --add-journal */
+		case 'A':
+		case 'M': /* --re-add missing */
+		case 'F': /* --re-add faulty  */
+		case 'c': /* --cluster-confirm */
+			/* add the device */
+			if (subarray) {
+				pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
+				goto abort;
+			}
+
+			/* Let's first try to write re-add to sysfs */
+			if (rdev != 0 &&
+			    (dv->disposition == 'A' || dv->disposition == 'F')) {
+				sysfs_init_dev(&devinfo, rdev);
+				if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+					pr_err("re-add %s to %s succeed\n",
+						dv->devname, info.sys_name);
+					break;
+				}
+			}
+
+			if (dv->disposition == 'F')
+				/* Need to remove first */
+				hot_remove_disk(fd, rdev, force);
+			/* Make sure it isn't in use (in 2.6 or later) */
+			tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
+			if (tfd >= 0) {
+				/* We know no-one else is using it.  We'll
+				 * need non-exclusive access to add it, so
+				 * do that now.
+				 */
+				close(tfd);
+				tfd = dev_open(dv->devname, O_RDONLY);
+			}
+			if (tfd < 0) {
+				if (dv->disposition == 'M')
+					continue;
+				pr_err("Cannot open %s: %s\n",
+					dv->devname, strerror(errno));
+				goto abort;
+			}
+			if (!frozen) {
+				if (sysfs_freeze_array(&info) == 1)
+					frozen = 1;
+				else
+					frozen = -1;
+			}
+			rv = Manage_add(fd, tfd, dv, tst, &array,
+					force, verbose, devname, update,
+					rdev, array_size, raid_slot);
+			close(tfd);
+			tfd = -1;
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+
+		case 'r':
+			/* hot remove */
+			if (subarray) {
+				pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
+				rv = -1;
+			} else
+				/* Manage_remove() takes 'force' before
+				 * 'verbose'; keep the two in that order.
+				 */
+				rv = Manage_remove(tst, fd, dv, sysfd,
+						   rdev, force, verbose,
+						   devname);
+			if (sysfd >= 0)
+				close(sysfd);
+			sysfd = -1;
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+
+		case 'f': /* set faulty */
+			if (!is_remove_safe(&array, fd, dv->devname, verbose)) {
+				pr_err("Cannot remove %s from %s, array will be failed.\n",
+				       dv->devname, devname);
+				if (sysfd >= 0)
+					close(sysfd);
+				goto abort;
+			}
+			/* fallthrough - the safety check passed, now fail it */
+		case 'I': /* incremental fail */
+			if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
+			    (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
+						rdev))) {
+				if (errno == EBUSY)
+					busy = 1;
+				pr_err("set device faulty failed for %s:  %s\n",
+					dv->devname, strerror(errno));
+				if (sysfd >= 0)
+					close(sysfd);
+				goto abort;
+			}
+			if (sysfd >= 0)
+				close(sysfd);
+			sysfd = -1;
+			count++;
+			if (verbose >= 0)
+				pr_err("set %s faulty in %s\n",
+					dv->devname, devname);
+			break;
+		case 'R': /* Mark as replaceable */
+			if (subarray) {
+				pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
+				rv = -1;
+			} else {
+				if (!frozen) {
+					if (sysfs_freeze_array(&info) == 1)
+						frozen = 1;
+					else
+						frozen = -1;
+				}
+				rv = Manage_replace(tst, fd, dv,
+						    rdev, verbose,
+						    devname);
+			}
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+		case 'W': /* --with device that doesn't match */
+			pr_err("No matching --replace device for --with %s\n",
+			       dv->devname);
+			goto abort;
+		case 'w': /* --with device which was matched */
+			rv = Manage_with(tst, fd, dv,
+					 rdev, verbose, devname);
+			if (rv < 0)
+				goto abort;
+			break;
+		}
+	}
+	free(tst);
+	/* Only undo the freeze if this call is the one that froze it */
+	if (frozen > 0)
+		sysfs_set_str(&info, NULL, "sync_action","idle");
+	if (test && count == 0)
+		return 2;
+	return 0;
+
+abort:
+	free(tst);
+	if (frozen > 0)
+		sysfs_set_str(&info, NULL, "sync_action","idle");
+	return !test && busy ? 2 : 1;
+}
+
+/* Open any md device and issue the RAID_AUTORUN ioctl on it.
+ * Returns 0 when the ioctl succeeded, 1 when the device could not be
+ * opened or the ioctl failed.
+ */
+int autodetect(void)
+{
+	int failed;
+	int mdfd = dev_open("9:0", O_RDONLY);
+
+	if (mdfd < 0)
+		return 1;
+
+	failed = (ioctl(mdfd, RAID_AUTORUN, 0) != 0);
+	close(mdfd);
+	return failed;
+}
+
+/**
+ * Update_subarray() - Apply a metadata update to one subarray of a container.
+ * @dev: name of the container device.
+ * @subarray: identifier of the subarray inside the container.
+ * @update: which update to perform.
+ * @ident: identity data handed to the metadata handler's update_subarray().
+ * @verbose: messages are printed when this is >= 0.
+ *
+ * The update is refused when the metadata handler has no update_subarray()
+ * method, and when the subarray is active unless the update is UOPT_PPL
+ * or UOPT_NO_PPL.  UOPT_PPL is further limited to raid4/5/6 subarrays.
+ *
+ * Return: 0 on success.  2 when the subarray cannot be opened or the
+ *         update is refused; otherwise the non-zero value returned by the
+ *         handler's update_subarray().
+ */
+int Update_subarray(char *dev, char *subarray, enum update_opt update,
+		    struct mddev_ident *ident, int verbose)
+{
+	struct supertype supertype, *st = &supertype;
+	int fd, rv = 2;
+	struct mdinfo *info = NULL;
+	char *update_verb = map_num(update_options, update);
+	bool allow_active = update == UOPT_PPL || update == UOPT_NO_PPL;
+
+	memset(st, 0, sizeof(*st));
+
+	fd = open_subarray(dev, subarray, st, verbose < 0);
+	if (fd < 0)
+		return 2;
+
+	if (!st->ss->update_subarray) {
+		if (verbose >= 0)
+			pr_err("Operation not supported for %s metadata\n",
+			       st->ss->name);
+		goto free_super;
+	}
+
+	if (!allow_active && is_subarray_active(subarray, st->devnm)) {
+		if (verbose >= 0)
+			pr_err("Subarray %s in %s is active, cannot update %s\n",
+				subarray, dev, update_verb);
+		goto free_super;
+	}
+
+	/* With mdmon running, queue the updates instead of writing them */
+	if (mdmon_running(st->devnm))
+		st->update_tail = &st->updates;
+
+	info = st->ss->container_content(st, subarray);
+
+	if (update == UOPT_PPL) {
+		/* The level check needs the subarray content, which the
+		 * cleanup below already treats as possibly missing.
+		 */
+		if (!info) {
+			if (verbose >= 0)
+				pr_err("Cannot read content of subarray %s in %s\n",
+				       subarray, dev);
+			goto free_super;
+		}
+		if (!is_level456(info->array.level)) {
+			pr_err("RWH policy ppl is supported only for raid4, raid5 and raid6.\n");
+			goto free_super;
+		}
+	}
+
+	rv = st->ss->update_subarray(st, subarray, update, ident);
+
+	if (rv) {
+		if (verbose >= 0)
+			pr_err("Failed to update %s of subarray-%s in %s\n",
+				update_verb, subarray, dev);
+	} else if (st->update_tail)
+		flush_metadata_updates(st);
+	else
+		st->ss->sync_metadata(st);
+
+	if (rv == 0 && update == UOPT_NAME && verbose >= 0)
+		pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n",
+		       subarray, dev);
+
+free_super:
+	free(info);
+	st->ss->free_super(st);
+	close(fd);
+
+	return rv;
+}
+
+/* Move spare from one array to another. If adding to destination array
+ * fails, add back to original array.
+ * The device is identified by 'devid' and handed to Manage_subdevs() as
+ * a "major:minor" name, first with disposition 'r' on 'from_devname' and
+ * then with disposition 'a' on 'to_devname'.
+ * Returns 1 on success, 0 on failure */
+int move_spare(char *from_devname, char *to_devname, dev_t devid)
+{
+	struct mddev_dev devlist;
+	char devname[32];
+	int moved = 0;
+
+	/* try to remove and add */
+	int fd1 = open(to_devname, O_RDONLY);
+	int fd2 = open(from_devname, O_RDONLY);
+
+	if (fd1 < 0 || fd2 < 0)
+		goto out;
+
+	devlist.next = NULL;
+	devlist.used = 0;
+	devlist.writemostly = FlagDefault;
+	devlist.failfast = FlagDefault;
+	devlist.devname = devname;
+	/* Bounded formatting: never write past the end of devname[] */
+	snprintf(devname, sizeof(devname), "%u:%u",
+		 (unsigned int)major(devid), (unsigned int)minor(devid));
+
+	devlist.disposition = 'r';
+	if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0,
+			   UOPT_UNDEFINED, 0) != 0)
+		goto out;
+
+	devlist.disposition = 'a';
+	if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0,
+			   UOPT_UNDEFINED, 0) == 0) {
+		/* make sure manager is aware of changes */
+		ping_manager(to_devname);
+		ping_manager(from_devname);
+		moved = 1;
+	} else {
+		/* Disposition is still 'a': put the spare back where it
+		 * came from.
+		 */
+		Manage_subdevs(from_devname, fd2, &devlist,
+			       -1, 0, UOPT_UNDEFINED, 0);
+	}
+
+out:
+	if (fd1 >= 0)
+		close(fd1);
+	if (fd2 >= 0)
+		close(fd2);
+	return moved;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 20:55:34 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 20:55:34 +0000
commit	7f1d6c8fec531fa1762d6d65576aecbee837982c (patch)
tree	b37177c380fa30d0336aad7cac9c72035523206a /Manage.c
parent	Initial commit. (diff)
download	mdadm-7f1d6c8fec531fa1762d6d65576aecbee837982c.tar.xz mdadm-7f1d6c8fec531fa1762d6d65576aecbee837982c.zip