diff options
Diffstat (limited to 'Incremental.c')
-rw-r--r-- | Incremental.c | 1764 |
1 files changed, 1764 insertions, 0 deletions
diff --git a/Incremental.c b/Incremental.c new file mode 100644 index 0000000..a57fc32 --- /dev/null +++ b/Incremental.c @@ -0,0 +1,1764 @@ +/* + * Incremental.c - support --incremental. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +#include "mdadm.h" +#include <sys/wait.h> +#include <dirent.h> +#include <ctype.h> + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *info); +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name); +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose); + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only); + +int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st) +{ + /* Add this device to an array, creating the array if necessary + * and starting the array if sensible or - if runstop>0 - if possible. + * + * This has several steps: + * + * 1/ Check if device is permitted by mdadm.conf, reject if not. + * 2/ Find metadata, reject if none appropriate (check + * version/name from args) + * 3/ Check if there is a match in mdadm.conf + * 3a/ if not, check for homehost match. If no match, assemble as + * a 'foreign' array. + * 4/ Determine device number. + * - If in mdadm.conf with std name, use that + * - UUID in /var/run/mdadm.map use that + * - If name is suggestive, use that. unless in use with different uuid. + * - Choose a free, high number. + * - Use a partitioned device unless strong suggestion not to. + * e.g. auto=md + * Don't choose partitioned for containers. + * 5/ Find out if array already exists + * 5a/ if it does not + * - choose a name, from mdadm.conf or 'name' field in array. + * - create the array + * - add the device + * 5b/ if it does + * - check one drive in array to make sure metadata is a reasonably + * close match. Reject if not (e.g. different type) + * - add the device + * 6/ Make sure /var/run/mdadm.map contains this array. + * 7/ Is there enough devices to possibly start the array? + * For a container, this means running Incremental_container. + * 7a/ if not, finish with success. + * 7b/ if yes, + * - read all metadata and arrange devices like -A does + * - if number of OK devices match expected, or -R and there are enough, + * start the array (auto-readonly). + */ + dev_t rdev, rdev2; + struct mdinfo info, dinfo; + struct mdinfo *sra = NULL, *d; + struct mddev_ident *match; + char chosen_name[1024]; + char *md_devname; + int rv = 1; + struct map_ent *mp, *map = NULL; + int dfd = -1, mdfd = -1; + char *avail = NULL; + int active_disks; + int trustworthy; + char *name_to_use; + struct dev_policy *policy = NULL; + struct map_ent target_array; + int have_target; + char *devname = devlist->devname; + int journal_device_missing = 0; + + struct createinfo *ci = conf_get_create_info(); + + if (!stat_is_blkdev(devname, &rdev)) + return rv; + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot open %s: %s.\n", + devname, strerror(errno)); + return rv; + } + /* If the device is a container, we do something very different */ + if (must_be_container(dfd)) { + if (!st) + st = super_by_fd(dfd, NULL); + if (st && st->ss->load_container) + rv = st->ss->load_container(st, dfd, NULL); + + close(dfd); + if (!rv && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + if (c->export) + printf("MD_DEVNAME=%s\n", devname); + rv = Incremental_container(st, devname, c, NULL); + map_unlock(&map); + return rv; + } + + pr_err("%s is not part of an md array.\n", + devname); + return rv; + } + + /* 1/ Check if device is permitted by mdadm.conf */ + + for (;devlist; devlist = devlist->next) + if (conf_test_dev(devlist->devname)) + break; + if (!devlist) { + devlist = conf_get_devs(); + for (;devlist; devlist = devlist->next) { + if (stat_is_blkdev(devlist->devname, &rdev2) && + rdev2 == rdev) + break; + } + } + if (!devlist) { + if (c->verbose >= 0) + pr_err("%s not permitted by mdadm.conf.\n", + devname); + goto out; + } + + /* 2/ Find metadata, reject if none appropriate (check + * version/name from args) */ + + if (!fstat_is_blkdev(dfd, devname, &rdev)) + goto out; + + dinfo.disk.major = major(rdev); + dinfo.disk.minor = minor(rdev); + + policy = disk_policy(&dinfo); + have_target = policy_check_path(&dinfo, &target_array); + + if (st == NULL && (st = guess_super_type(dfd, guess_array)) == NULL) { + if (c->verbose >= 0) + pr_err("no recognisable superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + NULL, c->verbose); + goto out; + } + st->ignore_hw_compat = 0; + + if (st->ss->compare_super == NULL || + st->ss->load_super(st, dfd, c->verbose >= 0 ? devname : NULL)) { + if (c->verbose >= 0) + pr_err("no RAID superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + st, c->verbose); + free(st); + goto out; + } + close (dfd); dfd = -1; + + st->ss->getinfo_super(st, &info, NULL); + + /* 3/ Check if there is a match in mdadm.conf */ + match = conf_match(st, &info, devname, c->verbose, &rv); + if (!match && rv == 2) + goto out; + + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose >= 0) + pr_err("array containing %s is explicitly ignored by mdadm.conf\n", + devname); + goto out; + } + + /* 3a/ if not, check for homehost match. If no match, continue + * but don't trust the 'name' in the array. Thus a 'random' minor + * number will be assigned, and the device name will be based + * on that. */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL_ANY; + else + trustworthy = FOREIGN; + + if (!match && !conf_test_metadata(st->ss->name, policy, + (trustworthy == LOCAL))) { + if (c->verbose >= 1) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, st->ss->name); + goto out; + } + if (trustworthy == LOCAL_ANY) + trustworthy = LOCAL; + + /* There are three possible sources for 'autof': command line, + * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf. + * ARRAY takes precedence, then command line, then + * CREATE. + */ + if (match && match->autof) + c->autof = match->autof; + if (c->autof == 0) + c->autof = ci->autof; + + name_to_use = info.name; + if (name_to_use[0] == 0 && info.array.level == LEVEL_CONTAINER) { + name_to_use = info.text_version; + trustworthy = METADATA; + } + if (name_to_use[0] && trustworthy != LOCAL && + ! c->require_homehost && + conf_name_is_free(name_to_use)) + trustworthy = LOCAL; + + /* strip "hostname:" prefix from name if we have decided + * to treat it as LOCAL + */ + if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL) + name_to_use = strchr(name_to_use, ':')+1; + + /* 4/ Check if array exists. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + /* Now check we can get O_EXCL. If not, probably "mdadm -A" has + * taken over + */ + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot reopen %s: %s.\n", + devname, strerror(errno)); + goto out_unlock; + } + /* Cannot hold it open while we add the device to the array, + * so we must release the O_EXCL and depend on the map_lock() + * So now is the best time to remove any partitions. + */ + remove_partitions(dfd); + close(dfd); + dfd = -1; + + mp = map_by_uuid(&map, info.uuid); + if (mp) + mdfd = open_dev(mp->devnm); + else + mdfd = -1; + + if (mdfd < 0) { + + /* Skip the clustered ones. This should be started by + * clustering resource agents + */ + if (info.array.state & (1 << MD_SB_CLUSTERED)) + goto out; + + /* Couldn't find an existing array, maybe make a new one */ + mdfd = create_mddev(match ? match->devname : NULL, + name_to_use, c->autof, trustworthy, chosen_name, 0); + + if (mdfd < 0) + goto out_unlock; + + if (sysfs_init(&info, mdfd, NULL)) { + pr_err("unable to initialize sysfs for %s\n", + chosen_name); + rv = 2; + goto out_unlock; + } + + if (set_array_info(mdfd, st, &info) != 0) { + pr_err("failed to set array info for %s: %s\n", + chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + + dinfo = info; + dinfo.disk.major = major(rdev); + dinfo.disk.minor = minor(rdev); + if (add_disk(mdfd, st, &info, &dinfo) != 0) { + pr_err("failed to add %s to new array %s: %s.\n", + devname, chosen_name, strerror(errno)); + ioctl(mdfd, STOP_ARRAY, 0); + rv = 2; + goto out_unlock; + } + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) { + /* It really should be 'none' - must be old buggy + * kernel, and mdadm -I may not be able to complete. + * So reject it. + */ + ioctl(mdfd, STOP_ARRAY, NULL); + pr_err("You have an old buggy kernel which cannot support\n --incremental reliably. Aborting.\n"); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 1; + /* 6/ Make sure /var/run/mdadm.map contains this array. */ + map_update(&map, fd2devnm(mdfd), + info.text_version, + info.uuid, chosen_name); + } else { + /* 5b/ if it does */ + /* - check one drive in array to make sure metadata is a reasonably */ + /* close match. Reject if not (e.g. different type) */ + /* - add the device */ + char dn[20]; + int dfd2; + int err; + struct supertype *st2; + struct mdinfo info2, *d; + + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + + /* It is generally not OK to add non-spare drives to a + * running array as they are probably missing because + * they failed. However if runstop is 1, then the + * array was possibly started early and our best bet is + * to add this anyway. + * Also if action policy is re-add or better we allow + * re-add. + * This doesn't apply to containers as the 'non-spare' + * flag has a different meaning. The test has to happen + * at the device level there + */ + if (!st->ss->external && + (info.disk.state & (1 << MD_DISK_SYNC)) != 0 && + !policy_action_allows(policy, st->ss->name, act_re_add) && + c->runstop < 1) { + if (md_array_active(mdfd)) { + pr_err("not adding %s to active array (without --run) %s\n", + devname, chosen_name); + rv = 2; + goto out_unlock; + } + } + if (!sra) { + rv = 2; + goto out_unlock; + } + if (sra->devs) { + sprintf(dn, "%d:%d", sra->devs->disk.major, + sra->devs->disk.minor); + dfd2 = dev_open(dn, O_RDONLY); + if (dfd2 < 0) { + pr_err("unable to open %s\n", devname); + rv = 2; + goto out_unlock; + } + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd2, NULL) || + st->ss->compare_super(st, st2, 1) != 0) { + pr_err("metadata mismatch between %s and chosen array %s\n", + devname, chosen_name); + close(dfd2); + rv = 2; + goto out_unlock; + } + close(dfd2); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + if (info.array.level != info2.array.level || + memcmp(info.uuid, info2.uuid, 16) != 0 || + info.array.raid_disks != info2.array.raid_disks) { + pr_err("unexpected difference between %s and %s.\n", + chosen_name, devname); + rv = 2; + goto out_unlock; + } + } + info.disk.major = major(rdev); + info.disk.minor = minor(rdev); + /* add disk needs to know about containers */ + if (st->ss->external) + sra->array.level = LEVEL_CONTAINER; + + if (info.array.state & (1 << MD_SB_CLUSTERED)) + info.disk.state |= (1 << MD_DISK_CLUSTER_ADD); + + err = add_disk(mdfd, st, sra, &info); + if (err < 0 && errno == EBUSY) { + /* could be another device present with the same + * disk.number. Find and reject any such + */ + find_reject(mdfd, st, sra, info.disk.number, + info.events, c->verbose, chosen_name); + err = add_disk(mdfd, st, sra, &info); + } + if (err < 0 && errno == EINVAL && + info.disk.state & (1<<MD_DISK_SYNC)) { + /* Maybe it needs to be added as a spare */ + if (policy_action_allows(policy, st->ss->name, + act_force_spare)) { + info.disk.state &= ~(1<<MD_DISK_SYNC); + err = add_disk(mdfd, st, sra, &info); + } else + if (c->verbose >= 0) + pr_err("can only add %s to %s as a spare, and force-spare is not set.\n", + devname, chosen_name); + } + if (err < 0) { + pr_err("failed to add %s to existing array %s: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 0; + for (d = sra->devs; d; d=d->next) + info.array.working_disks ++; + + } + if (strncmp(chosen_name, "/dev/md/", 8) == 0) + md_devname = chosen_name+8; + else + md_devname = chosen_name; + if (c->export) { + printf("MD_DEVICE=%s\n", fd2devnm(mdfd)); + printf("MD_DEVNAME=%s\n", md_devname); + printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no"); + } + + /* 7/ Is there enough devices to possibly start the array? */ + /* 7a/ if not, finish with success. */ + if (info.array.level == LEVEL_CONTAINER) { + char devnm[32]; + /* Try to assemble within the container */ + sysfs_uevent(sra, "change"); + if (!c->export && c->verbose >= 0) + pr_err("container %s now has %d device%s\n", + chosen_name, info.array.working_disks, + info.array.working_disks == 1?"":"s"); + sysfs_rules_apply(chosen_name, &info); + wait_for(chosen_name, mdfd); + if (st->ss->external) + strcpy(devnm, fd2devnm(mdfd)); + if (st->ss->load_container) + rv = st->ss->load_container(st, mdfd, NULL); + close(mdfd); + sysfs_free(sra); + if (!rv) + rv = Incremental_container(st, chosen_name, c, NULL); + map_unlock(&map); + /* after spare is added, ping monitor for external metadata + * so that it can eg. try to rebuild degraded array */ + if (st->ss->external) + ping_monitor(devnm); + return rv; + } + + /* We have added something to the array, so need to re-read the + * state. Eventually this state should be kept up-to-date as + * things change. + */ + sysfs_free(sra); + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + active_disks = count_active(st, sra, mdfd, &avail, &info); + + journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0); + + if (info.consistency_policy == CONSISTENCY_POLICY_PPL) + info.array.state |= 1; + + if (enough(info.array.level, info.array.raid_disks, + info.array.layout, info.array.state & 1, avail) == 0) { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start (%d).\n", + devname, chosen_name, active_disks); + rv = 0; + goto out_unlock; + } + + /* 7b/ if yes, */ + /* - if number of OK devices match expected, or -R and there */ + /* are enough, */ + /* + add any bitmap file */ + /* + start the array (auto-readonly). */ + + if (md_array_active(mdfd)) { + if (c->export) { + printf("MD_STARTED=already\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s which is already active.\n", + devname, chosen_name); + rv = 0; + goto out_unlock; + } + + map_unlock(&map); + if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) { + struct mdinfo *dsk; + /* Let's try to start it */ + + if (journal_device_missing) + pr_err("Trying to run with missing journal device\n"); + if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) { + pr_err("%s: This array is being reshaped and cannot be started\n", + chosen_name); + cont_err("by --incremental. Please use --assemble\n"); + goto out; + } + if (match && match->bitmap_file) { + int bmfd = open(match->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s.\n", + match->bitmap_file); + goto out; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + close(bmfd); + pr_err("Failed to set bitmapfile for %s.\n", + chosen_name); + goto out; + } + close(bmfd); + } + /* Need to remove from the array any devices which + * 'count_active' discerned were too old or inappropriate + */ + for (d = sra ? sra->devs : NULL ; d ; d = d->next) + if (d->disk.state & (1<<MD_DISK_REMOVED)) + remove_disk(mdfd, st, sra, d); + + if ((sra == NULL || active_disks >= info.array.working_disks) && + trustworthy != FOREIGN) + rv = ioctl(mdfd, RUN_ARRAY, NULL); + else + rv = sysfs_set_str(sra, NULL, + "array_state", "read-auto"); + /* Array might be O_EXCL which will interfere with + * fsck and mount. So re-open without O_EXCL. + */ + reopen_mddev(mdfd); + if (rv == 0) { + if (c->export) { + printf("MD_STARTED=yes\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, which has been started.\n", + devname, chosen_name); + rv = 0; + wait_for(chosen_name, mdfd); + /* We just started the array, so some devices + * might have been evicted from the array + * because their event counts were too old. + * If the action=re-add policy is in-force for + * those devices we should re-add them now. + */ + for (dsk = sra->devs; dsk ; dsk = dsk->next) { + if (disk_action_allows(dsk, st->ss->name, + act_re_add) && + add_disk(mdfd, st, sra, dsk) == 0) + pr_err("%s re-added to %s\n", + dsk->sys_name, chosen_name); + } + } else { + pr_err("%s attached to %s, but failed to start: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 1; + } + } else { + if (c->export) { + printf("MD_STARTED=unsafe\n"); + } else if (journal_device_missing) { + pr_err("Journal device is missing, not safe to start yet.\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start safely.\n", + devname, chosen_name); + rv = 0; + } +out: + free(avail); + if (dfd >= 0) + close(dfd); + if (mdfd >= 0) + close(mdfd); + if (policy) + dev_policy_free(policy); + sysfs_free(sra); + return rv; +out_unlock: + map_unlock(&map); + goto out; +} + +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name) +{ + /* Find a device attached to this array with a disk.number of number + * and events less than the passed events, and remove the device. + */ + struct mdinfo *d; + + if (md_array_active(mdfd)) + return; /* not safe to remove from active arrays + * without thinking more */ + + for (d = sra->devs; d ; d = d->next) { + char dn[24]; // 2*11 bytes for ints (including sign) + colon + null byte + int dfd; + struct mdinfo info; + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + if (st->ss->load_super(st, dfd, NULL)) { + close(dfd); + continue; + } + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + close(dfd); + + if (info.disk.number != number || info.events >= events) + continue; + + if (d->disk.raid_disk > -1) + sysfs_set_str(sra, d, "slot", "none"); + if (sysfs_set_str(sra, d, "state", "remove") == 0) + if (verbose >= 0) + pr_err("removing old device %s from %s\n", + d->sys_name+4, array_name); + } +} + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *bestinfo) +{ + /* count how many devices in sra think they are active */ + struct mdinfo *d; + int cnt = 0; + int replcnt = 0; + __u64 max_events = 0; + __u64 max_journal_events = 0; + char *avail = NULL; + int *best = NULL; + char *devmap = NULL; + int numdevs = 0; + int devnum; + int b, i; + int raid_disks = 0; + + if (!sra) + return 0; + + for (d = sra->devs ; d ; d = d->next) + numdevs++; + for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) { + char dn[30]; + int dfd; + int ok; + struct mdinfo info; + + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + ok = st->ss->load_super(st, dfd, NULL); + close(dfd); + if (ok != 0) + continue; + + info.array.raid_disks = raid_disks; + st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum); + if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL && + info.events > max_journal_events) + max_journal_events = info.events; + if (!avail) { + raid_disks = info.array.raid_disks; + avail = xcalloc(raid_disks, 1); + *availp = avail; + + best = xcalloc(raid_disks, sizeof(int)); + devmap = xcalloc(raid_disks, numdevs); + + st->ss->getinfo_super(st, &info, devmap); + } + + if (info.disk.state & (1<<MD_DISK_SYNC)) + { + if (cnt == 0) { + cnt++; + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else if (info.events == max_events) { + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + } else if (info.events == max_events-1) { + if (avail[info.disk.raid_disk] == 0) { + avail[info.disk.raid_disk] = 1; + best[info.disk.raid_disk] = devnum; + } + } else if (info.events < max_events - 1) + ; + else if (info.events == max_events+1) { + int i; + max_events = info.events; + for (i = 0; i < raid_disks; i++) + if (avail[i]) + avail[i]--; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else { /* info.events much bigger */ + memset(avail, 0, raid_disks); + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } + } else if (info.disk.state & (1<<MD_DISK_REPLACEMENT)) + replcnt++; + st->ss->free_super(st); + } + if (max_journal_events >= max_events - 1) + bestinfo->journal_clean = 1; + + if (!avail) + return 0; + /* We need to reject any device that thinks the best device is + * failed or missing */ + for (b = 0; b < raid_disks; b++) + if (avail[b] == 2) + break; + cnt = 0; + for (i = 0 ; i < raid_disks ; i++) { + if (i != b && avail[i]) + if (devmap[raid_disks * best[i] + b] == 0) { + /* This device thinks 'b' is failed - + * don't use it */ + devnum = best[i]; + for (d=sra->devs ; devnum; d = d->next) + devnum--; + d->disk.state |= (1 << MD_DISK_REMOVED); + avail[i] = 0; + } + if (avail[i]) + cnt++; + } + /* Also need to reject any spare device with an event count that + * is too high + */ + for (d = sra->devs; d; d = d->next) { + if (!(d->disk.state & (1<<MD_DISK_SYNC)) && + d->events > max_events) + d->disk.state |= (1 << MD_DISK_REMOVED); + } + free(best); + free(devmap); + return cnt + replcnt; +} + +/* test if container has degraded member(s) */ +static int +container_members_max_degradation(struct map_ent *map, struct map_ent *me) +{ + struct mdinfo *sra; + int degraded, max_degraded = 0; + + for(; map; map = map->next) { + if (!metadata_container_matches(map->metadata, me->devnm)) + continue; + /* most accurate information regarding array degradation */ + sra = sysfs_read(-1, map->devnm, + GET_DISKS | GET_DEVS | GET_STATE); + if (!sra) + continue; + degraded = sra->array.raid_disks - sra->array.active_disks - + sra->array.spare_disks; + if (degraded > max_degraded) + max_degraded = degraded; + sysfs_free(sra); + } + + return max_degraded; +} + +static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, int bare, + struct supertype *st, int verbose) +{ + /* This device doesn't have any md metadata + * The device policy allows 'spare' and if !bare, it allows spare-same-slot. + * If 'st' is not set, then we only know that some metadata allows this, + * others possibly don't. + * So look for a container or array to attach the device to. + * Prefer 'target' if that is set and the array is found. + * + * If st is set, then only arrays of that type are considered + * Return 0 on success, or some exit code on failure, probably 1. + */ + int rv = 1; + dev_t rdev; + struct map_ent *mp, *map = NULL; + struct mdinfo *chosen = NULL; + int dfd = *dfdp; + + if (!fstat_is_blkdev(dfd, devname, &rdev)) + return 1; + + /* + * Now we need to find a suitable array to add this to. + * We only accept arrays that: + * - match 'st' + * - are in the same domains as the device + * - are of an size for which the device will be useful + * and we choose the one that is the most degraded + */ + + if (map_lock(&map)) { + pr_err("failed to get exclusive lock on mapfile\n"); + return 1; + } + for (mp = map ; mp ; mp = mp->next) { + struct supertype *st2; + struct domainlist *dl = NULL; + struct mdinfo *sra; + unsigned long long devsize, freesize = 0; + struct spare_criteria sc = {0, 0}; + + if (is_subarray(mp->metadata)) + continue; + if (st) { + st2 = st->ss->match_metadata_desc(mp->metadata); + if (!st2 || + (st->minor_version >= 0 && + st->minor_version != st2->minor_version)) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata type doesn't match\n", + devname, mp->path); + free(st2); + continue; + } + free(st2); + } + sra = sysfs_read(-1, mp->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_COMPONENT|GET_VERSION); + if (sra) + sra->array.failed_disks = -1; + else + continue; + if (st == NULL) { + int i; + st2 = NULL; + for(i = 0; !st2 && superlist[i]; i++) + st2 = superlist[i]->match_metadata_desc( + sra->text_version); + if (!st2) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata not recognised.\n", + devname, mp->path); + goto next; + } + /* Need to double check the 'act_spare' permissions applies + * to this metadata. + */ + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + goto next; + if (!bare && !policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) + goto next; + } else + st2 = st; + /* update number of failed disks for mostly degraded + * container member */ + if (sra->array.failed_disks == -1) + sra->array.failed_disks = container_members_max_degradation(map, mp); + + get_dev_size(dfd, NULL, &devsize); + if (sra->component_size == 0) { + /* true for containers, here we must read superblock + * to obtain minimum spare size */ + struct supertype *st3 = dup_super(st2); + int mdfd = open_dev(mp->devnm); + if (mdfd < 0) { + free(st3); + goto next; + } + if (st3->ss->load_container && + !st3->ss->load_container(st3, mdfd, mp->path)) { + if (st3->ss->get_spare_criteria) + st3->ss->get_spare_criteria(st3, &sc); + st3->ss->free_super(st3); + } + free(st3); + close(mdfd); + } + if ((sra->component_size > 0 && + st2->ss->validate_geometry(st2, sra->array.level, sra->array.layout, + sra->array.raid_disks, &sra->array.chunk_size, + sra->component_size, + sra->devs ? sra->devs->data_offset : INVALID_SECTORS, + devname, &freesize, sra->consistency_policy, + 0) && + freesize < sra->component_size) || + (sra->component_size == 0 && devsize < sc.min_size)) { + if (verbose > 1) + pr_err("not adding %s to %s as it is too small\n", + devname, mp->path); + goto next; + } + /* test against target. + * If 'target' is set and 'bare' is false, we only accept + * arrays/containers that match 'target'. + * If 'target' is set and 'bare' is true, we prefer the + * array which matches 'target'. + * target is considered only if we deal with degraded array + */ + if (target && policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) { + if (strcmp(target->metadata, mp->metadata) == 0 && + memcmp(target->uuid, mp->uuid, + sizeof(target->uuid)) == 0 && + sra->array.failed_disks > 0) { + /* This is our target!! */ + sysfs_free(chosen); + chosen = sra; + sra = NULL; + /* skip to end so we don't check any more */ + while (mp->next) + mp = mp->next; + goto next; + } + /* not our target */ + if (!bare) + goto next; + } + + dl = domain_from_array(sra, st2->ss->name); + if (domain_test(dl, pol, st2->ss->name) != 1) { + /* domain test fails */ + if (verbose > 1) + pr_err("not adding %s to %s as it is not in a compatible domain\n", + devname, mp->path); + + goto next; + } + /* all tests passed, OK to add to this array */ + if (!chosen) { + chosen = sra; + sra = NULL; + } else if (chosen->array.failed_disks < sra->array.failed_disks) { + sysfs_free(chosen); + chosen = sra; + sra = NULL; + } + next: + sysfs_free(sra); + if (st != st2) + free(st2); + if (dl) + domain_free(dl); + } + if (chosen) { + /* add current device to chosen array as a spare */ + int mdfd = open_dev(chosen->sys_name); + if (mdfd >= 0) { + struct mddev_dev devlist; + char chosen_devname[24]; // 2*11 for int (including signs) + colon + null + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = FlagDefault; + devlist.failfast = FlagDefault; + devlist.devname = chosen_devname; + sprintf(chosen_devname, "%d:%d", major(rdev), + minor(rdev)); + devlist.disposition = 'a'; + close(dfd); + *dfdp = -1; + rv = Manage_subdevs(chosen->sys_name, mdfd, &devlist, + -1, 0, NULL, 0); + close(mdfd); + } + if (verbose > 0) { + if (rv == 0) + pr_err("added %s as spare for %s\n", + devname, chosen->sys_name); + else + pr_err("failed to add %s as spare for %s\n", + devname, chosen->sys_name); + } + sysfs_free(chosen); + } + map_unlock(&map); + return rv; +} + +static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct supertype *st, int verbose) +{ + /* we know that at least one partition virtual-metadata is + * allowed to incorporate spares like this device. We need to + * find a suitable device to copy partition information from. + * + * Getting a list of all disk (not partition) devices is + * slightly non-trivial. We could look at /sys/block, but + * that is theoretically due to be removed. Maybe best to use + * /dev/disk/by-path/?* and ignore names ending '-partNN' as + * we depend on this directory of 'path' info. But that fails + * to find loop devices and probably others. Maybe don't + * worry about that, they aren't the real target. + * + * So: check things in /dev/disk/by-path to see if they are in + * a compatible domain, then load the partition table and see + * if it is OK for the new device, and choose the largest + * partition table that fits. + */ + DIR *dir; + struct dirent *de; + char *chosen = NULL; + unsigned long long chosen_size = 0; + struct supertype *chosen_st = NULL; + int fd; + + dir = opendir("/dev/disk/by-path"); + if (!dir) + return 1; + while ((de = readdir(dir)) != NULL) { + char *ep; + struct dev_policy *pol2 = NULL; + struct domainlist *domlist = NULL; + int fd = -1; + struct mdinfo info; + struct supertype *st2 = NULL; + char *devname = NULL; + unsigned long long devsectors; + char *pathlist[2]; + + if (de->d_ino == 0 || de->d_name[0] == '.' || + (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN)) + goto next; + + ep = de->d_name + strlen(de->d_name); + while (ep > de->d_name && + isdigit(ep[-1])) + ep--; + if (ep > de->d_name + 5 && + strncmp(ep-5, "-part", 5) == 0) + /* This is a partition - skip it */ + goto next; + + pathlist[0] = de->d_name; + pathlist[1] = NULL; + pol2 = path_policy(pathlist, type_disk); + + domain_merge(&domlist, pol2, st ? st->ss->name : NULL); + if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1) + /* new device is incompatible with this device. */ + goto next; + + domain_free(domlist); + domlist = NULL; + + if (asprintf(&devname, "/dev/disk/by-path/%s", de->d_name) != 1) { + devname = NULL; + goto next; + } + fd = open(devname, O_RDONLY); + if (fd < 0) + goto next; + if (get_dev_size(fd, devname, &devsectors) == 0) + goto next; + devsectors >>= 9; + + if (st) + st2 = dup_super(st); + else + st2 = guess_super_type(fd, guess_partitions); + if (st2 == NULL || st2->ss->load_super(st2, fd, NULL) < 0) + goto next; + st2->ignore_hw_compat = 0; + + if (!st) { + /* Check domain policy again, this time referring to metadata */ + domain_merge(&domlist, pol2, st2->ss->name); + if (domain_test(domlist, pol, st2->ss->name) != 1) + /* Incompatible devices for this metadata type */ + goto next; + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + /* Some partition types allow sparing, but not + * this one. + */ + goto next; + } + + st2->ss->getinfo_super(st2, &info, NULL); + if (info.component_size > devsectors) + /* This partitioning doesn't fit in the device */ + goto next; + + /* This is an acceptable device to copy partition + * metadata from. We could just stop here, but I + * think I want to keep looking incase a larger + * metadata which makes better use of the device can + * be found. + */ + if (chosen == NULL || chosen_size < info.component_size) { + chosen_size = info.component_size; + free(chosen); + chosen = devname; + devname = NULL; + if (chosen_st) { + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + } + chosen_st = st2; + st2 = NULL; + } + + next: + free(devname); + domain_free(domlist); + dev_policy_free(pol2); + if (st2) + st2->ss->free_super(st2); + free(st2); + + if (fd >= 0) + close(fd); + } + + closedir(dir); + + if (!chosen) + return 1; + + /* 'chosen' is the best device we can find. Let's write its + * metadata to devname dfd is read-only so don't use that + */ + fd = open(devname, O_RDWR); + if (fd >= 0) { + chosen_st->ss->store_super(chosen_st, fd); + close(fd); + } + free(chosen); + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + return 0; +} + +static int is_bare(int dfd) +{ + unsigned long long size = 0; + char bufpad[4096 + 4096]; + char *buf = (char*)(((long)bufpad + 4096) & ~4095); + + if (lseek(dfd, 0, SEEK_SET) != 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + /* OK, first 4K appear blank, try the end. */ + get_dev_size(dfd, NULL, &size); + if (lseek(dfd, size-4096, SEEK_SET) < 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + return 1; +} + +/* adding a spare to a regular array is quite different from adding one to + * a set-of-partitions virtual array. + * This function determines which is worth trying and tries as appropriate. + * Arrays are given priority over partitions. + */ +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose) +{ + int i; + int rv; + int arrays_ok = 0; + int partitions_ok = 0; + int dfd = *dfdp; + int bare; + + /* Can only add a spare if device has at least one domain */ + if (pol_find(pol, pol_domain) == NULL) + return 1; + /* And only if some action allows spares */ + if (!policy_action_allows(pol, st?st->ss->name:NULL, act_spare)) + return 1; + + /* Now check if the device is bare. + * bare devices can always be added as a spare + * non-bare devices can only be added if spare-same-slot is permitted, + * and this device is replacing a previous device - in which case 'target' + * will be set. + */ + if (!is_bare(dfd)) { + /* Must have a target and allow same_slot */ + /* Later - may allow force_spare without target */ + if (!target || + !policy_action_allows(pol, st?st->ss->name:NULL, + act_spare_same_slot)) { + if (verbose > 1) + pr_err("%s is not bare, so not considering as a spare\n", + devname); + return 1; + } + bare = 0; + } else + bare = 1; + + /* It might be OK to add this device to an array - need to see + * what arrays might be candidates. + */ + if (st) { + /* just try to add 'array' or 'partition' based on this metadata */ + if (st->ss->add_to_super) + return array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + else + return partition_try_spare(devname, dfdp, pol, + st, verbose); + } + /* No metadata was specified or found so options are open. + * Check for whether any array metadata, or any partition metadata + * might allow adding the spare. This check is just help to avoid + * a more costly scan of all arrays when we can be sure that will + * fail. + */ + for (i = 0; (!arrays_ok || !partitions_ok) && superlist[i] ; i++) { + if (superlist[i]->add_to_super && !arrays_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + arrays_ok = 1; + if (superlist[i]->add_to_super == NULL && !partitions_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + partitions_ok = 1; + } + rv = 1; + if (arrays_ok) + rv = array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + if (rv != 0 && partitions_ok) + rv = partition_try_spare(devname, dfdp, pol, st, verbose); + return rv; +} + +int IncrementalScan(struct context *c, char *devnm) +{ + /* look at every device listed in the 'map' file. + * If one is found that is not running then: + * look in mdadm.conf for bitmap file. + * if one exists, but array has none, add it. + * try to start array in auto-readonly mode + */ + struct map_ent *mapl = NULL; + struct map_ent *me; + struct mddev_ident *devs, *mddev; + int rv = 0; + char container[32]; + char *only = NULL; + + map_read(&mapl); + devs = conf_get_ident(NULL); + +restart: + for (me = mapl ; me ; me = me->next) { + struct mdinfo *sra; + int mdfd; + + if (devnm && strcmp(devnm, me->devnm) != 0) + continue; + if (me->metadata[0] == '/') { + char *sl; + + if (!devnm) + continue; + + /* member array, need to work on container */ + strncpy(container, me->metadata+1, 32); + container[31] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + only = devnm; + devnm = container; + goto restart; + } + mdfd = open_dev(me->devnm); + + if (!is_fd_valid(mdfd)) + continue; + if (!isdigit(me->metadata[0])) { + /* must be a container */ + struct supertype *st = super_by_fd(mdfd, NULL); + int ret = 0; + struct map_ent *map = NULL; + + if (st && st->ss->load_container) + ret = st->ss->load_container(st, mdfd, NULL); + close_fd(&mdfd); + if (!ret && st && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + ret = Incremental_container(st, me->path, c, only); + map_unlock(&map); + } + if (ret) + rv = 1; + continue; + } + if (md_array_active(mdfd)) { + close_fd(&mdfd); + continue; + } + /* Ok, we can try this one. Maybe it needs a bitmap */ + for (mddev = devs ; mddev ; mddev = mddev->next) + if (mddev->devname && me->path && + devname_matches(mddev->devname, me->path)) + break; + if (mddev && mddev->bitmap_file) { + /* + * Note: early kernels will wrongly fail this, so it + * is a hint only + */ + int added = -1; + int bmfd; + + bmfd = open(mddev->bitmap_file, O_RDWR); + if (is_fd_valid(bmfd)) { + added = ioctl(mdfd, SET_BITMAP_FILE, bmfd); + close_fd(&bmfd); + } + if (c->verbose >= 0) { + if (added == 0) + pr_err("Added bitmap %s to %s\n", + mddev->bitmap_file, me->path); + else if (errno != EEXIST) + pr_err("Failed to add bitmap to %s: %s\n", + me->path, strerror(errno)); + } + } + /* FIXME check for reshape_active and consider not + * starting array. + */ + sra = sysfs_read(mdfd, NULL, 0); + if (sra) { + if (sysfs_set_str(sra, NULL, + "array_state", "read-auto") == 0) { + if (c->verbose >= 0) + pr_err("started array %s\n", + me->path ?: me->devnm); + } else { + pr_err("failed to start array %s: %s\n", + me->path ?: me->devnm, + strerror(errno)); + rv = 1; + } + sysfs_free(sra); + } + close_fd(&mdfd); + } + map_free(mapl); + return rv; +} + +static char *container2devname(char *devname) +{ + char *mdname = NULL; + + if (devname[0] == '/') { + int fd = open(devname, O_RDONLY); + if (fd >= 0) { + mdname = xstrdup(fd2devnm(fd)); + close(fd); + } + } else { + int uuid[4]; + struct map_ent *mp, *map = NULL; + + if (!parse_uuid(devname, uuid)) + return mdname; + mp = map_by_uuid(&map, uuid); + if (mp) + mdname = xstrdup(mp->devnm); + map_free(map); + } + + return mdname; +} + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only) +{ + /* Collect the contents of this container and for each + * array, choose a device name and assemble the array. + */ + + struct mdinfo *list; + struct mdinfo *ra; + struct map_ent *map = NULL; + struct mdinfo info; + int trustworthy; + struct mddev_ident *match; + int rv = 0; + int result = 0; + + st->ss->getinfo_super(st, &info, NULL); + + if ((c->runstop > 0 && info.container_enough >= 0) || + info.container_enough > 0) + /* pass */; + else { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose) + pr_err("not enough devices to start the container\n"); + return 0; + } + + match = conf_match(st, &info, devname, c->verbose, &rv); + if (match == NULL && rv == 2) + return rv; + + /* Need to compute 'trustworthy' */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL; + else + trustworthy = FOREIGN; + + list = st->ss->container_content(st, NULL); + /* when nothing to activate - quit */ + if (list == NULL) { + if (c->export) { + printf("MD_STARTED=nothing\n"); + } + return 0; + } + for (ra = list ; ra ; ra = ra->next) { + int mdfd; + char chosen_name[1024]; + struct map_ent *mp; + struct mddev_ident *match = NULL; + + /* do not activate arrays blocked by metadata handler */ + if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) { + pr_err("Cannot activate array %s in %s.\n", + ra->text_version, devname); + continue; + } + mp = map_by_uuid(&map, ra->uuid); + + if (mp) { + mdfd = open_dev(mp->devnm); + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + } else if (!only) { + + /* Check in mdadm.conf for container == devname and + * member == ra->text_version after second slash. + */ + char *sub = strchr(ra->text_version+1, '/'); + struct mddev_ident *array_list; + if (sub) { + sub++; + array_list = conf_get_ident(NULL); + } else + array_list = NULL; + for(; array_list ; array_list = array_list->next) { + char *dn; + if (array_list->member == NULL || + array_list->container == NULL) + continue; + if (strcmp(array_list->member, sub) != 0) + continue; + if (array_list->uuid_set && + !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid)) + continue; + dn = container2devname(array_list->container); + if (dn == NULL) + continue; + if (strncmp(dn, ra->text_version+1, + strlen(dn)) != 0 || + ra->text_version[strlen(dn)+1] != '/') { + free(dn); + continue; + } + free(dn); + /* we have a match */ + match = array_list; + if (c->verbose>0) + pr_err("match found for member %s\n", + array_list->member); + break; + } + + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose > 0) + pr_err("array %s/%s is explicitly ignored by mdadm.conf\n", + match->container, match->member); + continue; + } + if (match) + trustworthy = LOCAL; + + mdfd = create_mddev(match ? match->devname : NULL, + ra->name, + c->autof, + trustworthy, + chosen_name, 0); + } + if (only && (!mp || strcmp(mp->devnm, only) != 0)) + continue; + + if (mdfd < 0) { + pr_err("failed to open %s: %s.\n", + chosen_name, strerror(errno)); + return 2; + } + + assemble_container_content(st, mdfd, ra, c, + chosen_name, &result); + map_free(map); + map = NULL; + close(mdfd); + } + if (c->export && result) { + char sep = '='; + printf("MD_STARTED"); + if (result & INCR_NO) { + printf("%cno", sep); + sep = ','; + } + if (result & INCR_UNSAFE) { + printf("%cunsafe", sep); + sep = ','; + } + if (result & INCR_ALREADY) { + printf("%calready", sep); + sep = ','; + } + if (result & INCR_YES) { + printf("%cyes", sep); + sep = ','; + } + printf("\n"); + } + return 0; +} + +static void run_udisks(char *arg1, char *arg2) +{ + int pid = fork(); + int status; + if (pid == 0) { + manage_fork_fds(1); + execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL); + execl("/bin/udisks", "udisks", arg1, arg2, NULL); + exit(1); + } + while (pid > 0 && wait(&status) != pid) + ; +} + +static int force_remove(char *devnm, int fd, struct mdinfo *mdi, int verbose) +{ + int rv; + int devid = devnm2devid(devnm); + + run_udisks("--unmount", map_dev(major(devid), minor(devid), 0)); + rv = Manage_stop(devnm, fd, verbose, 1); + if (rv) { + /* At least we can try to trigger a 'remove' */ + sysfs_uevent(mdi, "remove"); + if (verbose) + pr_err("Fail to stop %s too.\n", devnm); + } + return rv; +} + +static void remove_from_member_array(struct mdstat_ent *memb, + struct mddev_dev *devlist, int verbose) +{ + int rv; + struct mdinfo mmdi; + int subfd = open_dev(memb->devnm); + + if (subfd >= 0) { + rv = Manage_subdevs(memb->devnm, subfd, devlist, verbose, + 0, NULL, 0); + if (rv & 2) { + if (sysfs_init(&mmdi, -1, memb->devnm)) + pr_err("unable to initialize sysfs for: %s\n", + memb->devnm); + else + force_remove(memb->devnm, subfd, &mmdi, + verbose); + } + close(subfd); + } +} + +/* + * IncrementalRemove - Attempt to see if the passed in device belongs to any + * raid arrays, and if so first fail (if needed) and then remove the device. + * + * @devname - The device we want to remove + * @id_path - name as found in /dev/disk/by-path for this device + * + * Note: the device name must be a kernel name like "sda", so + * that we can find it in /proc/mdstat + */ +int IncrementalRemove(char *devname, char *id_path, int verbose) +{ + int mdfd; + int rv = 0; + struct mdstat_ent *ent; + struct mddev_dev devlist; + struct mdinfo mdi; + char buf[32]; + + if (!id_path) + dprintf("incremental removal without --path <id_path> lacks the possibility to re-add new device in this port\n"); + + if (strchr(devname, '/')) { + pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname); + return 1; + } + ent = mdstat_by_component(devname); + if (!ent) { + if (verbose >= 0) + pr_err("%s does not appear to be a component of any array\n", devname); + return 1; + } + if (sysfs_init(&mdi, -1, ent->devnm)) { + pr_err("unable to initialize sysfs for: %s\n", devname); + return 1; + } + mdfd = open_dev_excl(ent->devnm); + if (is_fd_valid(mdfd)) { + close_fd(&mdfd); + if (sysfs_get_str(&mdi, NULL, "array_state", + buf, sizeof(buf)) > 0) { + if (strncmp(buf, "active", 6) == 0 || + strncmp(buf, "clean", 5) == 0) + sysfs_set_str(&mdi, NULL, + "array_state", "read-auto"); + } + } + mdfd = open_dev(ent->devnm); + if (mdfd < 0) { + if (verbose >= 0) + pr_err("Cannot open array %s!!\n", ent->devnm); + free_mdstat(ent); + return 1; + } + + if (id_path) { + struct map_ent *map = NULL, *me; + me = map_by_devnm(&map, ent->devnm); + if (me) + policy_save_path(id_path, me); + map_free(map); + } + + memset(&devlist, 0, sizeof(devlist)); + devlist.devname = devname; + devlist.disposition = 'f'; + /* for a container, we must fail each member array */ + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0) { + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *memb; + for (memb = mdstat ; memb ; memb = memb->next) { + if (is_container_member(memb, ent->devnm)) + remove_from_member_array(memb, + &devlist, verbose); + } + free_mdstat(mdstat); + } else { + rv |= Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); + if (rv & 2) { + /* Failed due to EBUSY, try to stop the array. + * Give udisks a chance to unmount it first. + */ + rv = force_remove(ent->devnm, mdfd, &mdi, verbose); + goto end; + } + } + + devlist.disposition = 'r'; + rv = Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); +end: + close(mdfd); + free_mdstat(ent); + return rv; +} |