diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-11-09 11:41:33 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-11-09 11:41:33 +0000 |
commit | e9922970d313f8bbf5440586f3020904ff7e057c (patch) | |
tree | 24090f3abf9370a2ff1ba6327d8c06c068f9c171 | |
parent | Releasing debian version 4.3+20240723-2. (diff) | |
download | mdadm-e9922970d313f8bbf5440586f3020904ff7e057c.tar.xz mdadm-e9922970d313f8bbf5440586f3020904ff7e057c.zip |
Merging upstream version 4.3+20241108.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
-rwxr-xr-x | .github/tools/install_ubuntu_packages.sh | 4 | ||||
-rw-r--r-- | .github/workflows/review.yml | 18 | ||||
-rw-r--r-- | Assemble.c | 14 | ||||
-rw-r--r-- | Build.c | 3 | ||||
-rw-r--r-- | Create.c | 6 | ||||
-rw-r--r-- | Detail.c | 16 | ||||
-rw-r--r-- | Examine.c | 30 | ||||
-rw-r--r-- | Grow.c | 150 | ||||
-rw-r--r-- | INSTALL | 13 | ||||
-rw-r--r-- | Incremental.c | 130 | ||||
-rw-r--r-- | Makefile | 30 | ||||
-rw-r--r-- | Manage.c | 187 | ||||
-rw-r--r-- | README.md | 55 | ||||
-rw-r--r-- | ReadMe.c | 324 | ||||
-rw-r--r-- | bitmap.c | 1 | ||||
-rw-r--r-- | config.c | 108 | ||||
-rw-r--r-- | dlink.c | 15 | ||||
-rw-r--r-- | dlink.h | 15 | ||||
-rw-r--r-- | lib.c | 2 | ||||
-rw-r--r-- | managemon.c | 106 | ||||
-rw-r--r-- | mapfile.c | 14 | ||||
-rw-r--r-- | md.4 | 67 | ||||
-rw-r--r-- | mdadm.8.in | 226 | ||||
-rw-r--r-- | mdadm.c | 79 | ||||
-rw-r--r-- | mdadm.conf.5.in | 58 | ||||
-rw-r--r-- | mdadm.h | 56 | ||||
-rw-r--r-- | mdadm_status.h | 13 | ||||
-rw-r--r-- | mdmon.c | 25 | ||||
-rw-r--r-- | mdmon.h | 12 | ||||
-rw-r--r-- | mdmonitor.c (renamed from Monitor.c) | 36 | ||||
-rw-r--r-- | mdopen.c | 196 | ||||
-rw-r--r-- | mdstat.c | 145 | ||||
-rw-r--r-- | monitor.c | 121 | ||||
-rw-r--r-- | msg.c | 10 | ||||
-rw-r--r-- | platform-intel.c | 448 | ||||
-rw-r--r-- | platform-intel.h | 7 | ||||
-rw-r--r-- | policy.c | 27 | ||||
-rw-r--r-- | restripe.c | 2 | ||||
-rw-r--r-- | sg_io.c | 60 | ||||
-rw-r--r-- | super-ddf.c | 2 | ||||
-rw-r--r-- | super-gpt.c | 7 | ||||
-rw-r--r-- | super-intel.c | 408 | ||||
-rw-r--r-- | super-mbr.c | 1 | ||||
-rw-r--r-- | super0.c | 14 | ||||
-rw-r--r-- | super1.c | 34 | ||||
-rw-r--r-- | sysfs.c | 96 | ||||
-rw-r--r-- | systemd/SUSE-mdadm_env.sh | 48 | ||||
-rw-r--r-- | systemd/mdmonitor.service | 22 | ||||
-rwxr-xr-x | test | 2 | ||||
-rw-r--r-- | tests/05r6tor0 | 4 | ||||
-rw-r--r-- | tests/07changelevels | 27 | ||||
-rw-r--r-- | tests/07changelevels.broken | 9 | ||||
-rw-r--r-- | tests/07reshape5intr.broken | 45 | ||||
-rw-r--r-- | tests/07testreshape5 | 1 | ||||
-rw-r--r-- | tests/07testreshape5.broken | 12 | ||||
-rw-r--r-- | tests/09imsm-assemble.broken | 6 | ||||
-rw-r--r-- | tests/18imsm-1d-takeover-r0_1d | 4 | ||||
-rw-r--r-- | tests/func.sh | 4 | ||||
-rw-r--r-- | tests/imsm-grow-template | 10 | ||||
-rw-r--r-- | udev-md-raid-assembly.rules | 4 | ||||
-rw-r--r-- | udev.c | 2 | ||||
-rw-r--r-- | util.c | 136 | ||||
-rw-r--r-- | xmalloc.c | 59 | ||||
-rw-r--r-- | xmalloc.h | 13 |
64 files changed, 2023 insertions, 1776 deletions
diff --git a/.github/tools/install_ubuntu_packages.sh b/.github/tools/install_ubuntu_packages.sh index 1a31ca4..4c88613 100755 --- a/.github/tools/install_ubuntu_packages.sh +++ b/.github/tools/install_ubuntu_packages.sh @@ -7,6 +7,6 @@ echo "Detected VERSION_CODENAME: $VERSION_CODENAME" sudo add-apt-repository -y "deb [arch=amd64] http://archive.ubuntu.com/ubuntu $VERSION_CODENAME \ main universe" # Install gcc -sudo apt-get -y update && sudo apt-get -y install gcc-$1 +sudo apt-get -y install gcc-$1 --no-upgrade --no-install-recommends --no-install-suggests # Install dependencies -sudo apt-get -y install make gcc libudev-dev devscripts +sudo apt-get -y install make gcc libudev-dev devscripts --no-upgrade --no-install-recommends --no-install-suggests diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml index 3fa29f6..e5fbf4e 100644 --- a/.github/workflows/review.yml +++ b/.github/workflows/review.yml @@ -4,25 +4,29 @@ env: cflags: -Werror jobs: make: - runs-on: ubuntu-latest + # when gcc is not found, it may be needed to update runner version + runs-on: ubuntu-24.04 name: Compilation test with gcc strategy: matrix: - gcc-version: [7, 8, 9, 10, 11, 12, 13] + # gcc-versions are used to test up to 5 years old + gcc-version: [9, 10, 11, 12, 13, 14] steps: - uses: actions/checkout@v4 - name: 'Add ubuntu repository and install dependencies' run: .github/tools/install_ubuntu_packages.sh ${{ matrix.gcc-version }} + - name: 'Check if gcc was installed correctly' + run: gcc-${{ matrix.gcc-version }} --version - name: 'Make with DEBUG flag' - run: CC=gcc-${{ matrix.gcc-version }} && V=1 make -j$(nproc) -B CXFLAGS=-DEBUG && make clean + run: V=1 make -j$(nproc) -B CC=gcc-${{ matrix.gcc-version }} CXFLAGS=-DEBUG && make clean - name: 'Make with DEBIAN flag' - run: CC=gcc-${{ matrix.gcc-version }} && V=1 make -j$(nproc) -B CXGALGS=-DEBIAN && make clean + run: V=1 make -j$(nproc) -B CC=gcc-${{ matrix.gcc-version }} CXFLAGS=-DEBIAN && make clean - name: 'Make 
with USE_PTHREADS flag' - run: CC=gcc-${{ matrix.gcc-version }} && V=1 make -j$(nproc) -B CXFLAGS=-USE_PTHREADS && make clean + run: V=1 make -j$(nproc) -B CC=gcc-${{ matrix.gcc-version }} CXFLAGS=-USE_PTHREADS && make clean - name: 'Make with DNO_LIBUDEV flag' - run: CC=gcc-${{ matrix.gcc-version }} && V=1 make -j$(nproc) -B CXFLAGS=-DNO_LIBUDEV && make clean + run: V=1 make -j$(nproc) -B CC=gcc-${{ matrix.gcc-version }} CXFLAGS=-DNO_LIBUDEV && make clean - name: 'Make' - run: CC=gcc-${{ matrix.gcc-version }} && V=1 make -j$(nproc) + run: V=1 make -j$(nproc) CC=gcc-${{ matrix.gcc-version }} - name: hardening-check mdadm run: hardening-check mdadm - name: hardening-check mdmon @@ -23,6 +23,8 @@ */ #include "mdadm.h" +#include "xmalloc.h" + #include <ctype.h> mapping_t assemble_statuses[] = { @@ -114,14 +116,11 @@ static int is_member_busy(char *metadata_version) int busy = 0; for (ent = mdstat; ent; ent = ent->next) { - if (ent->metadata_version == NULL) - continue; - if (strncmp(ent->metadata_version, "external:", 9) != 0) - continue; - if (!is_subarray(&ent->metadata_version[9])) + if (!is_mdstat_ent_subarray(ent)) continue; + /* Skip first char - it can be '/' or '-' */ - if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) { + if (strcmp(&ent->metadata_version[10], metadata_version + 1) == 0) { busy = 1; break; } @@ -1574,8 +1573,7 @@ try_again: /* Ignore 'host:' prefix of name */ name = strchr(name, ':')+1; - mdfd = create_mddev(mddev, name, ident->autof, trustworthy, - chosen_name, 0); + mdfd = create_mddev(mddev, name, trustworthy, chosen_name, 0); } if (mdfd < 0) { st->ss->free_super(st); @@ -75,8 +75,7 @@ int Build(struct mddev_ident *ident, struct mddev_dev *devlist, struct shape *s, /* We need to create the device. It can have no name. 
*/ map_lock(&map); - mdfd = create_mddev(ident->devname, NULL, c->autof, LOCAL, - chosen_name, 0); + mdfd = create_mddev(ident->devname, NULL, LOCAL, chosen_name, 0); if (mdfd < 0) { map_unlock(&map); return 1; @@ -23,9 +23,11 @@ */ #include "mdadm.h" -#include "udev.h" #include "md_u.h" #include "md_p.h" +#include "udev.h" +#include "xmalloc.h" + #include <ctype.h> #include <fcntl.h> #include <signal.h> @@ -1043,7 +1045,7 @@ int Create(struct supertype *st, struct mddev_ident *ident, int subdevs, /* We need to create the device */ map_lock(&map); - mdfd = create_mddev(ident->devname, ident->name, c->autof, LOCAL, chosen_name, 1); + mdfd = create_mddev(ident->devname, ident->name, LOCAL, chosen_name, 1); if (mdfd < 0) { map_unlock(&map); return 1; @@ -25,6 +25,8 @@ #include "mdadm.h" #include "md_p.h" #include "md_u.h" +#include "xmalloc.h" + #include <ctype.h> #include <dirent.h> @@ -549,16 +551,10 @@ int Detail(char *dev, struct context *c) } else if (inactive && !is_container) { printf(" State : inactive\n"); } - if (array.raid_disks) - printf(" Active Devices : %d\n", array.active_disks); - if (array.working_disks > 0) - printf(" Working Devices : %d\n", - array.working_disks); - if (array.raid_disks) { - printf(" Failed Devices : %d\n", array.failed_disks); - if (!external) - printf(" Spare Devices : %d\n", array.spare_disks); - } + printf(" Active Devices : %d\n", array.active_disks); + printf(" Working Devices : %d\n", array.working_disks); + printf(" Failed Devices : %d\n", array.failed_disks); + printf(" Spare Devices : %d\n", array.spare_disks); printf("\n"); if (array.level == 5) { str = map_num(r5layout, array.layout); @@ -22,14 +22,16 @@ * Email: <neilb@suse.de> */ -#include "mdadm.h" #include "dlink.h" +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include "xmalloc.h" #if ! defined(__BIG_ENDIAN) && ! 
defined(__LITTLE_ENDIAN) #error no endian defined #endif -#include "md_u.h" -#include "md_p.h" + int Examine(struct mddev_dev *devlist, struct context *c, struct supertype *forcest) @@ -111,8 +113,10 @@ int Examine(struct mddev_dev *devlist, close(fd); if (err) { - if (st) + if (st) { st->ss->free_super(st); + free(st); + } continue; } @@ -152,19 +156,24 @@ int Examine(struct mddev_dev *devlist, if (st->ss->export_examine_super) st->ss->export_examine_super(st); st->ss->free_super(st); + free(st); } else { printf("%s:\n",devlist->devname); st->ss->examine_super(st, c->homehost); st->ss->free_super(st); + free(st); } } if (c->brief) { - struct array *ap; - for (ap = arrays; ap; ap = ap->next) { + struct array *ap = arrays, *next; + + while (ap) { char sep='='; char *d; int newline = 0; + next = ap->next; + ap->st->ss->brief_examine_super(ap->st, c->verbose > 0); if (ap->spares && !ap->st->ss->external) newline += printf(" spares=%d", ap->spares); @@ -182,10 +191,15 @@ int Examine(struct mddev_dev *devlist, printf("\n"); ap->st->ss->brief_examine_subarrays(ap->st, c->verbose); } - ap->st->ss->free_super(ap->st); - /* FIXME free ap */ if (ap->spares || c->verbose > 0) printf("\n"); + + ap->st->ss->free_super(ap->st); + free(ap->st); + dl_free_all(ap->devs); + free(ap); + + ap = next; } } return rv; @@ -23,6 +23,8 @@ */ #include "mdadm.h" #include "dlink.h" +#include "xmalloc.h" + #include <sys/mman.h> #include <stddef.h> #include <stdint.h> @@ -530,8 +532,10 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); pr_err("Cannot set bitmap file for %s: %s\n", devname, strerror(err)); + close_fd(&bitmap_fd); return 1; } + close_fd(&bitmap_fd); } return 0; @@ -1692,7 +1696,7 @@ char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re) /* Current RAID6 layout has a RAID5 * equivalent - good */ - strcat(strcpy(layout, ls), "-6"); + snprintf(layout, 40, 
"%s-6", ls); l = map_name(r6layout, layout); if (l == UnSet) return "Cannot find RAID6 layout to convert to"; @@ -2032,7 +2036,8 @@ int Grow_reshape(char *devname, int fd, sysfs_free(sra); return 1; } else if (frozen < 0) { - pr_err("%s is performing resync/recovery and cannot be reshaped\n", devname); + pr_err("%s is performing resync/recovery and cannot be %s\n", devname, + (s->level != UnSet && s->level != array.level) ? "taken over" : "reshaped"); sysfs_free(sra); return 1; } @@ -2147,19 +2152,14 @@ int Grow_reshape(char *devname, int fd, if (s->size == MAX_SIZE) s->size = 0; array.size = s->size; - if (s->size & ~INT32_MAX) { - /* got truncated to 32bit, write to - * component_size instead - */ - rv = sysfs_set_num(sra, NULL, "component_size", s->size); - } else { - rv = md_set_array_info(fd, &array); + rv = sysfs_set_num(sra, NULL, "component_size", s->size); - /* manage array size when it is managed externally - */ - if ((rv == 0) && st->ss->external) - rv = set_array_size(st, sra, sra->text_version); - } + /* + * For native metadata, md/array_size is updated by kernel, + * for external management update it here. + */ + if (st->ss->external && rv == MDADM_STATUS_SUCCESS) + rv = set_array_size(st, sra, sra->text_version); if (raid0_takeover) { /* do not recync non-existing parity, @@ -2944,15 +2944,24 @@ static int impose_reshape(struct mdinfo *sra, * persists from some earlier problem. 
*/ int err = 0; + if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", reshape->after.layout) < 0) err = errno; + + /* new_level is introduced in kernel 6.12 */ + if (!err && sysfs_attribute_available(sra, NULL, "new_level") && + sysfs_set_num(sra, NULL, "new_level", info->new_level) < 0) + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", reshape->after.data_disks + reshape->parity) < 0) err = errno; + if (err) { pr_err("Cannot set device shape for %s\n", devname); @@ -3028,6 +3037,13 @@ static int impose_level(int fd, int level, char *devname, int verbose) makedev(disk.major, disk.minor)); hot_remove_disk(fd, makedev(disk.major, disk.minor), 1); } + /* + * hot_remove_disk lets kernel set MD_RECOVERY_RUNNING + * and it can't set level. It needs to wait sometime + * to let md thread to clear the flag. + */ + pr_info("wait 5 seconds to give kernel space to finish job\n"); + sleep_for(5, 0, true); } c = map_num(pers, level); if (c) { @@ -3083,6 +3099,7 @@ static int reshape_array(char *container, int fd, char *devname, int done; struct mdinfo *sra = NULL; char buf[SYSFS_MAX_BUF_SIZE]; + bool located_backup = false; /* when reshaping a RAID0, the component_size might be zero. * So try to fix that up. @@ -3165,8 +3182,10 @@ static int reshape_array(char *container, int fd, char *devname, goto release; } - if (!backup_file) + if (!backup_file) { backup_file = locate_backup(sra->sys_name); + located_backup = true; + } goto started; } @@ -3261,7 +3280,12 @@ static int reshape_array(char *container, int fd, char *devname, /* This is a spare that wants to * be part of the array. 
*/ - add_disk(fd, st, info2, d); + if (add_disk(fd, st, info2, d) < 0) { + pr_err("Can not add disk %s\n", + d->sys_name); + free(info2); + goto release; + } } } sysfs_free(info2); @@ -3607,15 +3631,13 @@ started: mdstat_wait(30 - (delayed-1) * 25); } while (delayed); mdstat_close(); - if (check_env("MDADM_GROW_VERIFY")) - fd = open(devname, O_RDONLY | O_DIRECT); - else - fd = -1; mlockall(MCL_FUTURE); if (signal_s(SIGTERM, catch_term) == SIG_ERR) goto release; + if (check_env("MDADM_GROW_VERIFY")) + fd = open(devname, O_RDONLY | O_DIRECT); if (st->ss->external) { /* metadata handler takes it from here */ done = st->ss->manage_reshape( @@ -3627,6 +3649,7 @@ started: fd, sra, &reshape, st, blocks, fdlist, offsets, d - odisks, fdlist + odisks, offsets + odisks); + close_fd(&fd); free(fdlist); free(offsets); @@ -3681,9 +3704,12 @@ started: set_array_size(st, info, info->text_version); if (info->new_level != reshape.level) { - if (fd < 0) - fd = open(devname, O_RDONLY); - impose_level(fd, info->new_level, devname, verbose); + fd = open_dev(sra->sys_name); + if (fd < 0) { + pr_err("Can't open %s\n", sra->sys_name); + goto out; + } + impose_level(fd, info->new_level, sra->sys_name, verbose); close(fd); if (info->new_level == 0) st->update_tail = NULL; @@ -3696,6 +3722,8 @@ out: exit(0); release: + if (located_backup) + free(backup_file); free(fdlist); free(offsets); if (orig_level != UnSet && sra) { @@ -3834,6 +3862,7 @@ int reshape_container(char *container, char *devname, pr_err("Unable to initialize sysfs for %s\n", mdstat->devnm); rv = 1; + close_fd(&fd); break; } @@ -4132,8 +4161,8 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, * waiting forever on a dead array */ char action[SYSFS_MAX_BUF_SIZE]; - if (sysfs_get_str(info, NULL, "sync_action", action, sizeof(action)) <= 0 || - strncmp(action, "reshape", 7) != 0) + + if (sysfs_get_str(info, NULL, "sync_action", action, sizeof(action)) <= 0) break; /* Some kernels reset 'sync_completed' to zero * 
before setting 'sync_action' to 'idle'. @@ -4141,12 +4170,18 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape, */ if (completed == 0 && advancing && strncmp(action, "idle", 4) == 0 && - info->reshape_progress > 0) + info->reshape_progress > 0) { + info->reshape_progress = need_backup; break; + } if (completed == 0 && !advancing && strncmp(action, "idle", 4) == 0 && info->reshape_progress < - (info->component_size * reshape->after.data_disks)) + (info->component_size * reshape->after.data_disks)) { + info->reshape_progress = need_backup; + break; + } + if (strncmp(action, "reshape", 7) != 0) break; sysfs_wait(fd, NULL); if (sysfs_fd_get_ll(fd, &completed) < 0) @@ -4413,7 +4448,10 @@ static void validate(int afd, int bfd, unsigned long long offset) */ if (afd < 0) return; - lseek64(bfd, offset - 4096, 0); + if (lseek64(bfd, offset - 4096, 0) < 0) { + pr_err("lseek64 fails %d:%s\n", errno, strerror(errno)); + return; + } if (read(bfd, &bsb2, 512) != 512) fail("cannot read bsb"); if (bsb2.sb_csum != bsb_csum((char*)&bsb2, @@ -4444,12 +4482,19 @@ static void validate(int afd, int bfd, unsigned long long offset) } } - lseek64(bfd, offset, 0); + if (lseek64(bfd, offset, 0) < 0) { + pr_err("lseek64 fails %d:%s\n", errno, strerror(errno)); + goto out; + } if ((unsigned long long)read(bfd, bbuf, len) != len) { //printf("len %llu\n", len); fail("read first backup failed"); } - lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); + + if (lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0) < 0) { + pr_err("lseek64 fails %d:%s\n", errno, strerror(errno)); + goto out; + } if ((unsigned long long)read(afd, abuf, len) != len) fail("read first from array failed"); if (memcmp(bbuf, abuf, len) != 0) @@ -4466,15 +4511,25 @@ static void validate(int afd, int bfd, unsigned long long offset) bbuf = xmalloc(abuflen); } - lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0); + if (lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0) < 0) { + pr_err("lseek64 
fails %d:%s\n", errno, strerror(errno)); + goto out; + } if ((unsigned long long)read(bfd, bbuf, len) != len) fail("read second backup failed"); - lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0); + if (lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0) < 0) { + pr_err("lseek64 fails %d:%s\n", errno, strerror(errno)); + goto out; + } if ((unsigned long long)read(afd, abuf, len) != len) fail("read second from array failed"); if (memcmp(bbuf, abuf, len) != 0) fail("data2 compare failed"); } +out: + free(abuf); + free(bbuf); + return; } int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, @@ -4692,6 +4747,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, unsigned long long *offsets; unsigned long long nstripe, ostripe; int ndata, odata; + int fd, backup_fd = -1; odata = info->array.raid_disks - info->delta_disks - 1; if (info->array.level == 6) @@ -4707,9 +4763,18 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, * been used */ old_disks = cnt; + + if (backup_file) { + backup_fd = open(backup_file, O_RDONLY); + if (!is_fd_valid(backup_fd)) { + pr_err("Can't open backup file %s : %s\n", + backup_file, strerror(errno)); + return -EINVAL; + } + } + for (i=old_disks-(backup_file?1:0); i<cnt; i++) { struct mdinfo dinfo; - int fd; int bsbsize; char *devname, namebuf[20]; unsigned long long lo, hi; @@ -4722,12 +4787,9 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, * else restore data and update all superblocks */ if (i == old_disks-1) { - fd = open(backup_file, O_RDONLY); - if (fd<0) { - pr_err("backup file %s inaccessible: %s\n", - backup_file, strerror(errno)); + if (!is_fd_valid(backup_fd)) continue; - } + fd = backup_fd; devname = backup_file; } else { fd = fdlist[i]; @@ -4882,6 +4944,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, pr_err("Error restoring backup from %s\n", devname); free(offsets); + close_fd(&backup_fd); return 1; } 
@@ -4898,6 +4961,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, pr_err("Error restoring second backup from %s\n", devname); free(offsets); + close_fd(&backup_fd); return 1; } @@ -4959,8 +5023,12 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, st->ss->store_super(st, fdlist[j]); st->ss->free_super(st); } + close_fd(&backup_fd); return 0; } + + close_fd(&backup_fd); + /* Didn't find any backup data, try to see if any * was needed. */ @@ -5033,7 +5101,11 @@ int Grow_continue_command(char *devname, int fd, struct context *c) goto Grow_continue_command_exit; } content = &array; - sysfs_init(content, fd, NULL); + if (sysfs_init(content, fd, NULL) < 0) { + pr_err("sysfs_init fails\n"); + ret_val = 1; + goto Grow_continue_command_exit; + } /* Need to load a superblock. * FIXME we should really get what we need from * sysfs diff --git a/INSTALL b/INSTALL deleted file mode 100644 index f7bcc3e..0000000 --- a/INSTALL +++ /dev/null @@ -1,13 +0,0 @@ - -To build mdadm, simply run: - - make - -to install, run - - make install - -as root. - - -No configuration is necessary. diff --git a/Incremental.c b/Incremental.c index 83db071..5e59b6d 100644 --- a/Incremental.c +++ b/Incremental.c @@ -29,6 +29,8 @@ */ #include "mdadm.h" +#include "xmalloc.h" + #include <sys/wait.h> #include <dirent.h> #include <ctype.h> @@ -105,8 +107,6 @@ int Incremental(struct mddev_dev *devlist, struct context *c, char *devname = devlist->devname; int journal_device_missing = 0; - struct createinfo *ci = conf_get_create_info(); - if (!stat_is_blkdev(devname, &rdev)) return rv; dfd = dev_open(devname, O_RDONLY); @@ -232,16 +232,6 @@ int Incremental(struct mddev_dev *devlist, struct context *c, if (trustworthy == LOCAL_ANY) trustworthy = LOCAL; - /* There are three possible sources for 'autof': command line, - * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf. - * ARRAY takes precedence, then command line, then - * CREATE. 
- */ - if (match && match->autof) - c->autof = match->autof; - if (c->autof == 0) - c->autof = ci->autof; - name_to_use = info.name; if (name_to_use[0] == 0 && is_container(info.array.level)) { name_to_use = info.text_version; @@ -295,8 +285,8 @@ int Incremental(struct mddev_dev *devlist, struct context *c, goto out; /* Couldn't find an existing array, maybe make a new one */ - mdfd = create_mddev(match ? match->devname : NULL, - name_to_use, c->autof, trustworthy, chosen_name, 0); + mdfd = create_mddev(match ? match->devname : NULL, name_to_use, trustworthy, + chosen_name, 0); if (mdfd < 0) goto out_unlock; @@ -770,7 +760,7 @@ static int count_active(struct supertype *st, struct mdinfo *sra, replcnt++; st->ss->free_super(st); } - if (max_journal_events >= max_events - 1) + if (max_events > 0 && max_journal_events >= max_events - 1) bestinfo->journal_clean = 1; if (!avail) @@ -1113,7 +1103,7 @@ static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, int fd = -1; struct mdinfo info; struct supertype *st2 = NULL; - char *devname = NULL; + char *dev_path_name = NULL; unsigned long long devsectors; char *pathlist[2]; @@ -1142,14 +1132,14 @@ static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, domain_free(domlist); domlist = NULL; - if (asprintf(&devname, "/dev/disk/by-path/%s", de->d_name) != 1) { - devname = NULL; + if (asprintf(&dev_path_name, "/dev/disk/by-path/%s", de->d_name) != 1) { + dev_path_name = NULL; goto next; } - fd = open(devname, O_RDONLY); + fd = open(dev_path_name, O_RDONLY); if (fd < 0) goto next; - if (get_dev_size(fd, devname, &devsectors) == 0) + if (get_dev_size(fd, dev_path_name, &devsectors) == 0) goto next; devsectors >>= 9; @@ -1188,8 +1178,8 @@ static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, if (chosen == NULL || chosen_size < info.component_size) { chosen_size = info.component_size; free(chosen); - chosen = devname; - devname = NULL; + chosen = 
dev_path_name; + dev_path_name = NULL; if (chosen_st) { chosen_st->ss->free_super(chosen_st); free(chosen_st); @@ -1199,7 +1189,7 @@ static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, } next: - free(devname); + free(dev_path_name); domain_free(domlist); dev_policy_free(pol2); if (st2) @@ -1246,7 +1236,7 @@ static int is_bare(int dfd) /* OK, first 4K appear blank, try the end. */ get_dev_size(dfd, NULL, &size); - if (lseek(dfd, size-4096, SEEK_SET) < 0 || + if ((size >= 4096 && lseek(dfd, size-4096, SEEK_SET) < 0) || read(dfd, buf, 4096) != 4096) return 0; @@ -1605,10 +1595,7 @@ static int Incremental_container(struct supertype *st, char *devname, if (match) trustworthy = LOCAL; - mdfd = create_mddev(match ? match->devname : NULL, - ra->name, - c->autof, - trustworthy, + mdfd = create_mddev(match ? match->devname : NULL, ra->name, trustworthy, chosen_name, 0); if (!is_fd_valid(mdfd)) { @@ -1674,42 +1661,71 @@ static void remove_from_member_array(struct mdstat_ent *memb, } } -/* - * IncrementalRemove - Attempt to see if the passed in device belongs to any - * raid arrays, and if so first fail (if needed) and then remove the device. +/** + * is_devnode_path() - check if the devname passed might be devnode path. + * @devnode: the path to check. * - * @devname - The device we want to remove - * @id_path - name as found in /dev/disk/by-path for this device + * Devnode must be located directly in /dev directory. It is not checking existence of the file + * because the device might no longer exist during removal from raid array. + */ +static bool is_devnode_path(char *devnode) +{ + char *devnm = strrchr(devnode, '/'); + + if (!devnm || *(devnm + 1) == 0) + return false; + + if (strncmp(devnode, DEV_DIR, DEV_DIR_LEN) == 0 && devnode + DEV_DIR_LEN - 1 == devnm) + return true; + + return false; +} + +/** + * Incremental_remove() - Remove the device from all raid arrays. 
+ * @devname: the device we want to remove, it could be kernel device name or devnode. + * @id_path: optional, /dev/disk/by-path path to save for bare scenarios support. + * @verbose: verbose flag. * - * Note: the device name must be a kernel name like "sda", so - * that we can find it in /proc/mdstat + * First, fail the device (if needed) and then remove the device from native raid array or external + * container. If it is external container, the device is removed from each subarray first. */ -int IncrementalRemove(char *devname, char *id_path, int verbose) +int Incremental_remove(char *devname, char *id_path, int verbose) { - int mdfd; - int rv = 0; + char *devnm = basename(devname); + struct mddev_dev devlist = {0}; + char buf[SYSFS_MAX_BUF_SIZE]; + struct mdstat_ent *mdstat; struct mdstat_ent *ent; - struct mddev_dev devlist; struct mdinfo mdi; - char buf[SYSFS_MAX_BUF_SIZE]; + int rv = 1; + int mdfd; - if (!id_path) - dprintf("incremental removal without --path <id_path> lacks the possibility to re-add new device in this port\n"); + if (strcmp(devnm, devname) != 0) + if (!is_devnode_path(devname)) { + pr_err("Cannot remove \"%s\", devnode path or kernel device name is allowed.\n", + devname); + return 1; + } - if (strchr(devname, '/')) { - pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname); + mdstat = mdstat_read(0, 0); + if (!mdstat) { + pr_err("Cannot read /proc/mdstat file, aborting\n"); return 1; } - ent = mdstat_by_component(devname); + + ent = mdstat_find_by_member_name(mdstat, devnm); if (!ent) { if (verbose >= 0) - pr_err("%s does not appear to be a component of any array\n", devname); - return 1; + pr_vrb("%s does not appear to be a component of any array\n", devnm); + goto out; } + if (sysfs_init(&mdi, -1, ent->devnm)) { - pr_err("unable to initialize sysfs for: %s\n", devname); - return 1; + pr_err("unable to initialize sysfs for: %s\n", devnm); + goto out; } + mdfd = open_dev_excl(ent->devnm); if 
(is_fd_valid(mdfd)) { close_fd(&mdfd); @@ -1725,8 +1741,7 @@ int IncrementalRemove(char *devname, char *id_path, int verbose) if (mdfd < 0) { if (verbose >= 0) pr_err("Cannot open array %s!!\n", ent->devnm); - free_mdstat(ent); - return 1; + goto out; } if (id_path) { @@ -1737,20 +1752,16 @@ int IncrementalRemove(char *devname, char *id_path, int verbose) map_free(map); } - memset(&devlist, 0, sizeof(devlist)); - devlist.devname = devname; + devlist.devname = devnm; devlist.disposition = 'I'; /* for a container, we must fail each member array */ - if (ent->metadata_version && - strncmp(ent->metadata_version, "external:", 9) == 0) { - struct mdstat_ent *mdstat = mdstat_read(0, 0); + if (is_mdstat_ent_external(ent)) { struct mdstat_ent *memb; for (memb = mdstat ; memb ; memb = memb->next) { if (is_container_member(memb, ent->devnm)) remove_from_member_array(memb, &devlist, verbose); } - free_mdstat(mdstat); } else { /* * This 'I' incremental remove is a try-best effort, @@ -1765,7 +1776,8 @@ int IncrementalRemove(char *devname, char *id_path, int verbose) rv = Manage_subdevs(ent->devnm, mdfd, &devlist, verbose, 0, UOPT_UNDEFINED, 0); - close(mdfd); - free_mdstat(ent); + close_fd(&mdfd); +out: + free_mdstat(mdstat); return rv; } @@ -30,7 +30,7 @@ # define "CXFLAGS" to give extra flags to CC. # e.g. 
make CXFLAGS=-O to optimise -CXFLAGS ?=-O2 -D_FORTIFY_SOURCE=2 +CXFLAGS ?=-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE TCC = tcc UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found ) #DIET_GCC = diet gcc @@ -76,6 +76,27 @@ ifeq ($(origin STRINGOPOVERFLOW), undefined) endif endif +ifeq ($(origin NOSTRICTOVERFLOW), undefined) + NOSTRICTOVERFLOW := $(shell $(CC) -Q --help=warning 2>&1 | grep "strict-overflow" | wc -l) + ifneq "$(NOSTRICTOVERFLOW)" "0" + CWFLAGS += -fno-strict-overflow + endif +endif + +ifeq ($(origin NODELETENULLPOINTER), undefined) + NODELETENULLPOINTER := $(shell $(CC) -Q --help=optimizers 2>&1 | grep "delete-null-pointer-checks" | wc -l) + ifneq "$(NODELETENULLPOINTER)" "0" + CWFLAGS += -fno-delete-null-pointer-checks + endif +endif + +ifeq ($(origin WRAPV), undefined) + WRAPV := $(shell $(CC) -Q --help=optimizers 2>&1 | grep "wrapv" | wc -l) + ifneq "$(WRAPV)" "0" + CWFLAGS += -fwrapv + endif +endif + ifdef DEBIAN CPPFLAGS += -DDEBIAN endif @@ -165,11 +186,11 @@ endif OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o uuid.o util.o maps.o lib.o udev.o \ Manage.o Assemble.o Build.o \ - Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ + Create.o Detail.o Examine.o Grow.o mdmonitor.o dlink.o Kill.o Query.o \ Incremental.o Dump.o \ mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ super-mbr.o super-gpt.o \ - restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o msg.o xmalloc.o \ platform-intel.o probe_roms.o crc32c.o drive_encryption.o CHECK_OBJS = restripe.o uuid.o sysfs.o maps.o lib.o xmalloc.o dlink.o @@ -180,7 +201,7 @@ INCL = mdadm.h part.h bitmap.h MON_OBJS = mdmon.o monitor.o managemon.o uuid.o util.o maps.o mdstat.o sysfs.o config.o mapfile.o mdopen.o\ policy.o lib.o udev.o \ - Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \ + Kill.o dlink.o ReadMe.o 
super-intel.o \ super-mbr.o super-gpt.o \ super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \ platform-intel.o probe_roms.o crc32c.o drive_encryption.o @@ -320,7 +341,6 @@ install-systemd: systemd/mdmon@.service $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ rm -f .install.tmp.3; \ done - if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(LIB_DIR)/mdadm_env.sh ;fi install-bin: mdadm mdmon $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm @@ -26,6 +26,8 @@ #include "md_u.h" #include "md_p.h" #include "udev.h" +#include "xmalloc.h" + #include <ctype.h> int Manage_ro(char *devname, int fd, int readonly) @@ -238,13 +240,14 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry) "array_state", "inactive")) < 0 && errno == EBUSY) { + err = errno; sleep_for(0, MSEC_TO_NSEC(200), true); count--; } if (err) { if (verbose >= 0) pr_err("failed to stop array %s: %s\n", - devname, strerror(errno)); + devname, strerror(err)); rv = 1; goto out; } @@ -276,10 +279,8 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry) */ mds = mdstat_read(0, 0); for (m = mds; m; m = m->next) - if (m->metadata_version && - strncmp(m->metadata_version, "external:", 9)==0 && - metadata_container_matches(m->metadata_version+9, - devnm)) { + if (is_mdstat_ent_external(m) && + metadata_container_matches(m->metadata_version + 9, devnm)) { if (verbose >= 0) pr_err("Cannot stop container %s: member %s still active\n", devname, m->devnm); @@ -440,14 +441,15 @@ done: count = 25; err = 0; while (count && fd >= 0 && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 && errno == EBUSY) { + err = errno; sleep_for(0, MSEC_TO_NSEC(200), true); count --; } if (fd >= 0 && err) { if (verbose >= 0) { pr_err("failed to stop array %s: %s\n", - devname, strerror(errno)); - if (errno == EBUSY) + devname, strerror(err)); + if (err == EBUSY) cont_err("Perhaps a running process, mounted filesystem or 
active volume group?\n"); } rv = 1; @@ -791,6 +793,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, int j; mdu_disk_info_t disc; struct map_ent *map = NULL; + bool add_new_super = false; if (!get_dev_size(tfd, dv->devname, &ldsize)) { if (dv->disposition == 'M') @@ -1009,6 +1012,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, goto unlock; if (tst->ss->write_init_super(tst)) goto unlock; + add_new_super = true; } else if (dv->disposition == 'A') { /* this had better be raid1. * As we are "--re-add"ing we must find a spare slot @@ -1076,6 +1080,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, map_unlock(&map); return 1; unlock: + if (add_new_super) + Kill(dv->devname, tst, 0, -1, 0); map_unlock(&map); return -1; } @@ -1439,29 +1445,25 @@ int Manage_subdevs(char *devname, int fd, for (dv = devlist; dv; dv = dv->next) { dev_t rdev = 0; /* device to add/remove etc */ - int rv; - int mj,mn; + int rv, err = 0; + int mj, mn; raid_slot = -1; if (dv->disposition == 'c') { - rv = parse_cluster_confirm_arg(dv->devname, - &dv->devname, - &raid_slot); + rv = parse_cluster_confirm_arg(dv->devname, &dv->devname, &raid_slot); if (rv) { pr_err("Could not get the devname of cluster\n"); goto abort; } } - if (strcmp(dv->devname, "failed") == 0 || - strcmp(dv->devname, "faulty") == 0) { + if (strcmp(dv->devname, "failed") == 0 || strcmp(dv->devname, "faulty") == 0) { if (dv->disposition != 'A' && dv->disposition != 'r') { pr_err("%s only meaningful with -r or --re-add, not -%c\n", dv->devname, dv->disposition); goto abort; } - add_faulty(dv, fd, (dv->disposition == 'A' - ? 'F' : 'r')); + add_faulty(dv, fd, (dv->disposition == 'A' ? 
'F' : 'r')); continue; } if (strcmp(dv->devname, "detached") == 0) { @@ -1477,6 +1479,7 @@ int Manage_subdevs(char *devname, int fd, if (strcmp(dv->devname, "missing") == 0) { struct mddev_dev *add_devlist; struct mddev_dev **dp; + if (dv->disposition == 'c') { rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL); break; @@ -1491,7 +1494,7 @@ int Manage_subdevs(char *devname, int fd, pr_err("no devices to scan for missing members.\n"); continue; } - for (dp = &add_devlist; *dp; dp = & (*dp)->next) + for (dp = &add_devlist; *dp; dp = &(*dp)->next) /* 'M' (for 'missing') is like 'A' without errors */ (*dp)->disposition = 'M'; *dp = dv->next; @@ -1499,74 +1502,50 @@ int Manage_subdevs(char *devname, int fd, continue; } - if (strncmp(dv->devname, "set-", 4) == 0 && - strlen(dv->devname) == 5) { + if (strncmp(dv->devname, "set-", 4) == 0 && strlen(dv->devname) == 5) { int copies; - if (dv->disposition != 'r' && - dv->disposition != 'f') { - pr_err("'%s' only meaningful with -r or -f\n", - dv->devname); + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("'%s' only meaningful with -r or -f\n", dv->devname); goto abort; } + if (array.level != 10) { - pr_err("'%s' only meaningful with RAID10 arrays\n", - dv->devname); + pr_err("'%s' only meaningful with RAID10 arrays\n", dv->devname); goto abort; } - copies = ((array.layout & 0xff) * - ((array.layout >> 8) & 0xff)); - if (array.raid_disks % copies != 0 || - dv->devname[4] < 'A' || - dv->devname[4] >= 'A' + copies || - copies > 26) { - pr_err("'%s' not meaningful with this array\n", - dv->devname); + + copies = ((array.layout & 0xff) * ((array.layout >> 8) & 0xff)); + + if (array.raid_disks % copies != 0 || dv->devname[4] < 'A' || + dv->devname[4] >= 'A' + copies || copies > 26) { + pr_err("'%s' not meaningful with this array\n", dv->devname); goto abort; } add_set(dv, fd, dv->devname[4]); continue; } - if (strchr(dv->devname, '/') == NULL && - strchr(dv->devname, ':') == NULL && + if (!strchr(dv->devname, '/') && 
!strchr(dv->devname, ':') && strlen(dv->devname) < 50) { - /* Assume this is a kernel-internal name like 'sda1' */ - int found = 0; - char dname[55]; - if (dv->disposition != 'r' && dv->disposition != 'f' && - dv->disposition != 'I') { + char *array_devnm = fd2devnm(fd); + + /* This is a kernel-internal name like 'sda1' */ + + if (!strchr("rfI", dv->disposition)) { pr_err("%s only meaningful with -r, -f or -I, not -%c\n", dv->devname, dv->disposition); goto abort; } - sprintf(dname, "dev-%s", dv->devname); - sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev"); - if (sysfd >= 0) { - char dn[SYSFS_MAX_BUF_SIZE]; - if (sysfs_fd_get_str(sysfd, dn, sizeof(dn)) > 0 && - sscanf(dn, "%d:%d", &mj,&mn) == 2) { - rdev = makedev(mj,mn); - found = 1; - } - close_fd(&sysfd); - sysfd = -1; - } - if (!found) { - sysfd = sysfs_open(fd2devnm(fd), dname, "state"); - if (sysfd < 0) { - pr_err("%s does not appear to be a component of %s\n", - dv->devname, devname); + sysfd = sysfs_open_memb_attr(array_devnm, dv->devname, "state", O_RDWR); + if (!is_fd_valid(sysfd)) { + pr_err("%s does not appear to be a component of %s\n", dv->devname, + devname); goto abort; } - } - } else if ((dv->disposition == 'r' || - dv->disposition == 'f') && - get_maj_min(dv->devname, &mj, &mn)) { - /* for 'fail' and 'remove', the device might - * not exist. - */ + } else if (strchr("rf", dv->disposition) && get_maj_min(dv->devname, &mj, &mn)) { + /* for 'fail' and 'remove', the device might not exist. */ rdev = makedev(mj, mn); } else { tfd = dev_open(dv->devname, O_RDONLY); @@ -1575,6 +1554,7 @@ int Manage_subdevs(char *devname, int fd, close_fd(&tfd); } else { int open_err = errno; + if (!stat_is_blkdev(dv->devname, &rdev)) { if (dv->disposition == 'M') /* non-fatal. 
Also improbable */ @@ -1590,16 +1570,15 @@ int Manage_subdevs(char *devname, int fd, if (dv->disposition == 'M') /* non-fatal */ continue; - pr_err("Cannot open %s: %s\n", - dv->devname, strerror(open_err)); + pr_err("Cannot open %s: %s\n", dv->devname, + strerror(open_err)); goto abort; } } } - switch(dv->disposition){ + switch (dv->disposition) { default: - pr_err("internal error - devmode[%s]=%d\n", - dv->devname, dv->disposition); + pr_err("internal error - devmode[%s]=%d\n", dv->devname, dv->disposition); goto abort; case 'a': case 'S': /* --add-spare */ @@ -1615,12 +1594,11 @@ int Manage_subdevs(char *devname, int fd, } /* Let's first try to write re-add to sysfs */ - if (rdev != 0 && - (dv->disposition == 'A' || dv->disposition == 'F')) { + if (rdev != 0 && (dv->disposition == 'A' || dv->disposition == 'F')) { sysfs_init_dev(&devinfo, rdev); if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) { - pr_err("re-add %s to %s succeed\n", - dv->devname, info.sys_name); + pr_err("re-add %s to %s succeed\n", dv->devname, + info.sys_name); break; } } @@ -1651,8 +1629,7 @@ int Manage_subdevs(char *devname, int fd, else frozen = -1; } - rv = Manage_add(fd, tfd, dv, tst, &array, - force, verbose, devname, update, + rv = Manage_add(fd, tfd, dv, tst, &array, force, verbose, devname, update, rdev, array_size, raid_slot); close_fd(&tfd); if (rv < 0) @@ -1667,12 +1644,10 @@ int Manage_subdevs(char *devname, int fd, pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n"); rv = -1; } else - rv = Manage_remove(tst, fd, dv, sysfd, - rdev, verbose, force, + rv = Manage_remove(tst, fd, dv, sysfd, rdev, verbose, force, devname); - if (sysfd >= 0) - close_fd(&sysfd); - sysfd = -1; + close_fd(&sysfd); + if (rv < 0) goto abort; if (rv > 0) @@ -1686,23 +1661,31 @@ int Manage_subdevs(char *devname, int fd, close_fd(&sysfd); goto abort; } - case 'I': /* incremental fail */ - if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) || - 
(sysfd < 0 && ioctl(fd, SET_DISK_FAULTY, - rdev))) { - if (errno == EBUSY) - busy = 1; - pr_err("set device faulty failed for %s: %s\n", - dv->devname, strerror(errno)); - close_fd(&sysfd); - goto abort; + case 'I': + if (is_fd_valid(sysfd)) { + static const char val[] = "faulty"; + + rv = sysfs_write_descriptor(sysfd, val, strlen(val), &err); + } else { + rv = ioctl(fd, SET_DISK_FAULTY, rdev); + if (rv) + err = errno; } + close_fd(&sysfd); - count++; - if (verbose >= 0) - pr_err("set %s faulty in %s\n", - dv->devname, devname); - break; + + if (rv == MDADM_STATUS_SUCCESS) { + count++; + + pr_vrb("set %s faulty in %s\n", dv->devname, devname); + break; + } + + if (err == EBUSY) + busy = 1; + + pr_err("set device faulty failed for %s: %s\n", dv->devname, strerror(err)); + goto abort; case 'R': /* Mark as replaceable */ if (subarray) { pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n"); @@ -1714,9 +1697,7 @@ int Manage_subdevs(char *devname, int fd, else frozen = -1; } - rv = Manage_replace(tst, fd, dv, - rdev, verbose, - devname); + rv = Manage_replace(tst, fd, dv, rdev, verbose, devname); } if (rv < 0) goto abort; @@ -1724,12 +1705,10 @@ int Manage_subdevs(char *devname, int fd, count++; break; case 'W': /* --with device that doesn't match */ - pr_err("No matching --replace device for --with %s\n", - dv->devname); + pr_err("No matching --replace device for --with %s\n", dv->devname); goto abort; case 'w': /* --with device which was matched */ - rv = Manage_with(tst, fd, dv, - rdev, verbose, devname); + rv = Manage_with(tst, fd, dv, rdev, verbose, devname); if (rv < 0) goto abort; break; @@ -1737,7 +1716,7 @@ int Manage_subdevs(char *devname, int fd, } free(tst); if (frozen > 0) - sysfs_set_str(&info, NULL, "sync_action","idle"); + sysfs_set_str(&info, NULL, "sync_action", "idle"); if (test && count == 0) return 2; return 0; @@ -1745,7 +1724,7 @@ int Manage_subdevs(char *devname, int fd, abort: free(tst); if 
(frozen > 0) - sysfs_set_str(&info, NULL, "sync_action","idle"); + sysfs_set_str(&info, NULL, "sync_action", "idle"); return !test && busy ? 2 : 1; } @@ -87,6 +87,61 @@ If there are differences between github and kernel.org, please contact kernel.or We do not support kernel versions below **v3.10**. Please be aware that maintainers may remove workarounds and fixes for legacy issues. +# Dependencies + +The following packages are required for compilation: + +| RHEL | SLES | Debian/Ubuntu | +| :---: | :---: | :---: | +| `pkgconf` | `pkg-config` | `pkg-config` | +| `gcc` | `gcc` | `gcc` | +| `make` | `make` | `make` | +| `libudev-devel` | `libudev-devel` | `libudev-dev` | + +# Compiling mdadm + +Run `make` command to compile mdadm. + +Specifying more jobs e.g. `make -j4` can decrease compilation time significantly. + +Various values can be specified for the `CXFLAGS` variable to customize the build process: +- Run `make CXFLAGS=-ggdb` to include gdb debugging information. +- Run `make CXFLAGS=-DDEBUG` to enable additional debug information through dprintf statements +and call traces. +- Run `make CXFLAGS=-DNO_LIBUDEV` to compile without `libudev`. + +To build with more than one option specified in `CXFLAGS`, separate each option with a space, e.g. +`make CXFLAGS="-ggdb -DDEBUG"`. + +Additionally, the `EXTRAVERSION` variable can be set to build with user-friendly version label, +useful when customizing mdadm builds or labeling some instance in between major releases, +e.g. `make EXTRAVERSION="custom-label"`. + +# Installing mdadm + +Before installing mdadm, it is advised to uninstall vendor-provided packages (mdadm.deb, mdadm.rpm +etc.) in order to avoid configuration issues. + +Run `make install` command to install mdadm. This command invokes the following targets: +- `install-bin` +- `install-man` +- `install-udev` + +After installing mdadm, consider rebuilding initramfs to ensure the changes take effect. 
+ +List of installation targets: +- Run `make install-bin` to install the mdadm and mdmon binary files. +- Run `make install-systemd` to install the systemd services. +- Run `make install-udev` to install the udev rules. +- Run `make install-man` to install the manual pages (`mdadm.8`, `md.4`, `mdadm.conf.5`, +`mdmon.8`). + +The following targets are deprecated and should not be used: +- `install-static` +- `install-tcc` +- `install-uclibc` +- `install-klibc` + # License It is released under the terms of the **GNU General Public License version 2** as published @@ -81,140 +81,138 @@ char Version[] = "mdadm - v" VERSION " - " VERS_DATE EXTRAVERSION "\n"; * found, it is started. */ -char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; -char short_monitor_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:r:n:x:u:c:d:z:U:N:safRSow1tye:k:"; -char short_bitmap_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; -char short_bitmap_auto_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:"; +char short_opts[] = "-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; +char short_monitor_opts[] = "-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:r:n:x:u:c:d:z:U:N:safRSow1tye:k:"; +char short_bitmap_opts[] = "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; +char short_bitmap_auto_opts[] = "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:"; struct option long_options[] = { - {"manage", 0, 0, ManageOpt}, - {"misc", 0, 0, MiscOpt}, - {"assemble", 0, 0, 'A'}, - {"build", 0, 0, 'B'}, - {"create", 0, 0, 'C'}, - {"detail", 0, 0, 'D'}, - {"examine", 0, 0, 'E'}, - {"follow", 0, 0, 'F'}, - {"grow", 0, 0, 'G'}, - {"incremental",0,0, 'I'}, - {"zero-superblock", 0, 0, KillOpt}, /* deliberately not a short_option */ - {"query", 0, 0, 'Q'}, - {"examine-bitmap", 0, 0, 'X'}, - {"auto-detect", 0, 0, AutoDetect}, - {"detail-platform", 0, 0, DetailPlatform}, - {"kill-subarray", 1, 0, KillSubarray}, 
- {"update-subarray", 1, 0, UpdateSubarray}, - {"udev-rules", 2, 0, UdevRules}, - {"offroot", 0, 0, OffRootOpt}, - {"examine-badblocks", 0, 0, ExamineBB}, - - {"dump", 1, 0, Dump}, - {"restore", 1, 0, Restore}, - - /* synonyms */ - {"monitor", 0, 0, 'F'}, - - /* after those will normally come the name of the md device */ - - {"help", 0, 0, 'h'}, - {"help-options",0,0, HelpOptions}, - {"version", 0, 0, 'V'}, - {"verbose", 0, 0, 'v'}, - {"quiet", 0, 0, 'q'}, - - /* For create or build: */ - {"chunk", 1, 0, ChunkSize}, - {"rounding", 1, 0, ChunkSize}, /* for linear, chunk is really a - * rounding number */ - {"level", 1, 0, 'l'}, /* 0,1,4,5,6,linear */ - {"parity", 1, 0, Layout}, /* {left,right}-{a,}symmetric */ - {"layout", 1, 0, Layout}, - {"raid-disks",1, 0, 'n'}, - {"raid-devices",1, 0, 'n'}, - {"spare-disks",1,0, 'x'}, - {"spare-devices",1,0, 'x'}, - {"size", 1, 0, 'z'}, - {"auto", 1, 0, Auto}, /* also for --assemble */ - {"assume-clean",0,0, AssumeClean }, - {"write-zeroes",0,0, WriteZeroes }, - {"metadata", 1, 0, 'e'}, /* superblock format */ - {"bitmap", 1, 0, Bitmap}, - {"bitmap-chunk", 1, 0, BitmapChunk}, - {"write-behind", 2, 0, WriteBehind}, - {"write-mostly",0, 0, WriteMostly}, - {"failfast", 0, 0, FailFast}, - {"nofailfast",0, 0, NoFailFast}, - {"re-add", 0, 0, ReAdd}, - {"homehost", 1, 0, HomeHost}, - {"data-offset",1, 0, DataOffset}, - {"nodes",1, 0, Nodes}, /* also for --assemble */ - {"home-cluster",1, 0, ClusterName}, - {"write-journal",1, 0, WriteJournal}, - {"consistency-policy", 1, 0, 'k'}, - - /* For assemble */ - {"uuid", 1, 0, 'u'}, - {"super-minor",1,0, SuperMinor}, - {"name", 1, 0, 'N'}, - {"config", 1, 0, ConfigFile}, - {"scan", 0, 0, 's'}, - {"force", 0, 0, Force}, - {"update", 1, 0, 'U'}, - {"freeze-reshape", 0, 0, FreezeReshape}, - - /* Management */ - {"add", 0, 0, Add}, - {"add-spare", 0, 0, AddSpare}, - {"add-journal", 0, 0, AddJournal}, - {"remove", 0, 0, Remove}, - {"fail", 0, 0, Fail}, - {"set-faulty",0, 0, Fail}, - {"replace", 0, 
0, Replace}, - {"with", 0, 0, With}, - {"run", 0, 0, 'R'}, - {"stop", 0, 0, 'S'}, - {"readonly", 0, 0, 'o'}, - {"readwrite", 0, 0, 'w'}, - {"no-degraded",0,0, NoDegraded }, - {"wait", 0, 0, WaitOpt}, - {"wait-clean", 0, 0, Waitclean }, - {"action", 1, 0, Action }, - {"cluster-confirm", 0, 0, ClusterConfirm}, - - /* For Detail/Examine */ - {"brief", 0, 0, Brief}, - {"no-devices",0, 0, NoDevices}, - {"export", 0, 0, 'Y'}, - {"sparc2.2", 0, 0, Sparc22}, - {"test", 0, 0, 't'}, - {"prefer", 1, 0, Prefer}, - - /* For Follow/monitor */ - {"mail", 1, 0, EMail}, - {"program", 1, 0, ProgramOpt}, - {"alert", 1, 0, ProgramOpt}, - {"increment", 1, 0, Increment}, - {"delay", 1, 0, 'd'}, - {"daemonise", 0, 0, Fork}, - {"daemonize", 0, 0, Fork}, - {"oneshot", 0, 0, '1'}, - {"pid-file", 1, 0, 'i'}, - {"syslog", 0, 0, 'y'}, - {"no-sharing", 0, 0, NoSharing}, - - /* For Grow */ - {"backup-file", 1,0, BackupFile}, - {"invalid-backup",0,0,InvalidBackup}, - {"array-size", 1, 0, 'Z'}, - {"continue", 0, 0, Continue}, - - /* For Incremental */ - {"rebuild-map", 0, 0, RebuildMapOpt}, - {"path", 1, 0, IncrementalPath}, - - {0, 0, 0, 0} + {"manage", 0, 0, ManageOpt}, + {"misc", 0, 0, MiscOpt}, + {"assemble", 0, 0, 'A'}, + {"build", 0, 0, 'B'}, + {"create", 0, 0, 'C'}, + {"detail", 0, 0, 'D'}, + {"examine", 0, 0, 'E'}, + + {"follow", 0, 0, 'F'}, + {"monitor", 0, 0, 'F'}, + + {"grow", 0, 0, 'G'}, + {"incremental", 0, 0, 'I'}, + {"zero-superblock", 0, 0, KillOpt}, /* deliberately not a short_option */ + {"query", 0, 0, 'Q'}, + {"examine-bitmap", 0, 0, 'X'}, + {"auto-detect", 0, 0, AutoDetect}, + {"detail-platform", 0, 0, DetailPlatform}, + {"kill-subarray", 1, 0, KillSubarray}, + {"update-subarray", 1, 0, UpdateSubarray}, + {"udev-rules", 2, 0, UdevRules}, + {"offroot", 0, 0, OffRootOpt}, + {"examine-badblocks", 0, 0, ExamineBB}, + + {"dump", 1, 0, Dump}, + {"restore", 1, 0, Restore}, + + /* after those will normally come the name of the md device */ + {"help", 0, 0, 'h'}, + {"help-options", 0, 
0, HelpOptions}, + {"version", 0, 0, 'V'}, + {"verbose", 0, 0, 'v'}, + {"quiet", 0, 0, 'q'}, + + /* For create or build: */ + {"chunk", 1, 0, ChunkSize}, + {"rounding", 1, 0, ChunkSize}, /* for linear, chunk is really a rounding number */ + {"level", 1, 0, 'l'}, /* 0, 1, 4, 5, 6, linear */ + {"parity", 1, 0, Layout}, /* {left,right}-{a,}symmetric */ + {"layout", 1, 0, Layout}, + {"raid-disks", 1, 0, 'n'}, + {"raid-devices", 1, 0, 'n'}, + {"spare-disks", 1, 0, 'x'}, + {"spare-devices", 1, 0, 'x'}, + {"size", 1, 0, 'z'}, + {"auto", 1, 0, Auto}, /* Deprecated, left for backward compatibility */ + {"assume-clean", 0, 0, AssumeClean }, + {"write-zeroes", 0, 0, WriteZeroes }, + {"metadata", 1, 0, 'e'}, /* superblock format */ + {"bitmap", 1, 0, Bitmap}, + {"bitmap-chunk", 1, 0, BitmapChunk}, + {"write-behind", 2, 0, WriteBehind}, + {"write-mostly", 0, 0, WriteMostly}, + {"failfast", 0, 0, FailFast}, + {"nofailfast", 0, 0, NoFailFast}, + {"re-add", 0, 0, ReAdd}, + {"homehost", 1, 0, HomeHost}, + {"data-offset", 1, 0, DataOffset}, + {"nodes", 1, 0, Nodes}, + {"home-cluster", 1, 0, ClusterName}, + {"write-journal", 1, 0, WriteJournal}, + {"consistency-policy", 1, 0, 'k'}, + + /* For assemble */ + {"uuid", 1, 0, 'u'}, + {"super-minor", 1, 0, SuperMinor}, + {"name", 1, 0, 'N'}, + {"config", 1, 0, ConfigFile}, + {"scan", 0, 0, 's'}, + {"force", 0, 0, Force}, + {"update", 1, 0, 'U'}, + {"freeze-reshape", 0, 0, FreezeReshape}, + + /* Management */ + {"add", 0, 0, Add}, + {"add-spare", 0, 0, AddSpare}, + {"add-journal", 0, 0, AddJournal}, + {"remove", 0, 0, Remove}, + {"fail", 0, 0, Fail}, + {"set-faulty", 0, 0, Fail}, + {"replace", 0, 0, Replace}, + {"with", 0, 0, With}, + {"run", 0, 0, 'R'}, + {"stop", 0, 0, 'S'}, + {"readonly", 0, 0, 'o'}, + {"readwrite", 0, 0, 'w'}, + {"no-degraded", 0, 0, NoDegraded}, + {"wait", 0, 0, WaitOpt}, + {"wait-clean", 0, 0, Waitclean}, + {"action", 1, 0, Action}, + {"cluster-confirm", 0, 0, ClusterConfirm}, + + /* For Detail/Examine */ + {"brief", 
0, 0, Brief}, + {"no-devices", 0, 0, NoDevices}, + {"export", 0, 0, 'Y'}, + {"sparc2.2", 0, 0, Sparc22}, + {"test", 0, 0, 't'}, + {"prefer", 1, 0, Prefer}, + + /* For Follow/monitor */ + {"mail", 1, 0, EMail}, + {"program", 1, 0, ProgramOpt}, + {"alert", 1, 0, ProgramOpt}, + {"increment", 1, 0, Increment}, + {"delay", 1, 0, 'd'}, + + {"daemonise", 0, 0, Fork}, + {"daemonize", 0, 0, Fork}, + + {"oneshot", 0, 0, '1'}, + {"pid-file", 1, 0, 'i'}, + {"syslog", 0, 0, 'y'}, + {"no-sharing", 0, 0, NoSharing}, + + /* For Grow */ + {"backup-file", 1, 0, BackupFile}, + {"invalid-backup", 0, 0, InvalidBackup}, + {"array-size", 1, 0, 'Z'}, + {"continue", 0, 0, Continue}, + + /* For Incremental */ + {"rebuild-map", 0, 0, RebuildMapOpt}, + {"path", 1, 0, IncrementalPath}, + + {0, 0, 0, 0} }; char Usage[] = @@ -287,64 +285,6 @@ char OptionHelp[] = " device relates to the md driver\n" " --auto-detect : Start arrays auto-detected by the kernel\n" ; -/* -"\n" -" For create or build:\n" -" --bitmap= -b : File to store bitmap in - may pre-exist for --build\n" -" --chunk= -c : chunk size of kibibytes\n" -" --rounding= : rounding factor for linear array (==chunk size)\n" -" --level= -l : raid level: 0,1,4,5,6,10,linear, or mp for create.\n" -" : 0,1,10,mp,faulty or linear for build.\n" -" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" -" --layout= : same as --parity, for RAID10: [fno]NN \n" -" --raid-devices= -n : number of active devices in array\n" -" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" -" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" -" --force -f : Honour devices as listed on command line. Don't\n" -" : insert a missing drive for RAID5.\n" -" --assume-clean : Assume the array is already in-sync. 
This is dangerous for RAID5.\n" -" --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" -" --delay= -d : seconds between bitmap updates\n" -" --write-behind= : number of simultaneous write-behind requests to allow (requires bitmap)\n" -" --name= -N : Textual name for array - max 32 characters\n" -"\n" -" For assemble:\n" -" --bitmap= -b : File to find bitmap information in\n" -" --uuid= -u : uuid of array to assemble. Devices which don't\n" -" have this uuid are excluded\n" -" --super-minor= -m : minor number to look for in super-block when\n" -" choosing devices to use.\n" -" --name= -N : Array name to look for in super-block.\n" -" --config= -c : config file\n" -" --scan -s : scan config file for missing information\n" -" --force -f : Assemble the array even if some superblocks appear out-of-date\n" -" --update= -U : Update superblock: try '-A --update=?' for list of options.\n" -" --no-degraded : Do not start any degraded arrays - default unless --scan.\n" -"\n" -" For detail or examine:\n" -" --brief -b : Just print device name and UUID\n" -"\n" -" For follow/monitor:\n" -" --mail= -m : Address to mail alerts of failure to\n" -" --program= -p : Program to run when an event is detected\n" -" --alert= : same as --program\n" -" --delay= -d : seconds of delay between polling state. 
default=60\n" -"\n" -" General management:\n" -" --add -a : add, or hotadd subsequent devices\n" -" --re-add : re-add a recently removed device\n" -" --remove -r : remove subsequent devices\n" -" --fail -f : mark subsequent devices as faulty\n" -" --set-faulty : same as --fail\n" -" --replace : mark a device for replacement\n" -" --run -R : start a partially built array\n" -" --stop -S : deactivate array, releasing all resources\n" -" --readonly -o : mark array as readonly\n" -" --readwrite -w : mark array as readwrite\n" -" --zero-superblock : erase the MD superblock from a device.\n" -" --wait -W : wait for recovery/resync/reshape to finish.\n" -; -*/ char Help_create[] = "Usage: mdadm --create device --chunk=X --level=Y --raid-devices=Z devices\n" @@ -374,7 +314,7 @@ char Help_create[] = " --rounding= : rounding factor for linear array (==chunk size)\n" " --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n" " --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" -" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --layout= : same as --parity, for RAID10: [fno]NN\n" " --raid-devices= -n : number of active devices in array\n" " --spare-devices= -x : number of spare (eXtra) devices in initial array\n" " --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" @@ -638,7 +578,7 @@ char Help_config[] = "\n" " Other configuration lines include:\n" " mailaddr, mailfrom, program, monitordelay used for --monitor mode\n" -" create, auto used when creating device names in /dev\n" +" create, used when creating device names in /dev\n" " homehost, homecluster, policy, part-policy used to guide policy in various\n" " situations\n" "\n" @@ -19,6 +19,7 @@ */ #include "mdadm.h" +#include "xmalloc.h" static inline void sb_le_to_cpu(bitmap_super_t *sb) { @@ -24,6 +24,8 @@ #include "mdadm.h" #include "dlink.h" +#include "xmalloc.h" + #include <dirent.h> #include <glob.h> #include <fnmatch.h> @@ -169,7 +171,6 @@ inline void 
ident_init(struct mddev_ident *ident) assert(ident); ident->assembled = false; - ident->autof = 0; ident->bitmap_fd = -1; ident->bitmap_file = NULL; ident->container = NULL; @@ -360,39 +361,41 @@ struct mddev_dev *load_partitions(void) struct mddev_dev *load_containers(void) { struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mddev_dev *dev_list = NULL; + struct map_ent *map_list = NULL; struct mdstat_ent *ent; - struct mddev_dev *d; - struct mddev_dev *rv = NULL; - struct map_ent *map = NULL, *me; - if (!mdstat) - return NULL; + for (ent = mdstat; ent; ent = ent->next) { + struct mddev_dev *d; + struct map_ent *map; - for (ent = mdstat; ent; ent = ent->next) - if (ent->metadata_version && - strncmp(ent->metadata_version, "external:", 9) == 0 && - !is_subarray(&ent->metadata_version[9])) { - d = xcalloc(1, sizeof(*d)); - me = map_by_devnm(&map, ent->devnm); - if (me) - d->devname = xstrdup(me->path); - else if (asprintf(&d->devname, "/dev/%s", ent->devnm) < 0) { - free(d); - continue; - } - d->next = rv; - rv = d; - map_free(map); - map = NULL; + if (!is_mdstat_ent_external(ent)) + continue; + + if (is_mdstat_ent_subarray(ent)) + continue; + + d = xcalloc(1, sizeof(*d)); + + map = map_by_devnm(&map_list, ent->devnm); + if (map) { + d->devname = xstrdup(map->path); + } else if (asprintf(&d->devname, "/dev/%s", ent->devnm) < 0) { + free(d); + continue; } + + d->next = dev_list; + dev_list = d; + } + free_mdstat(mdstat); - map_free(map); + map_free(map_list); - return rv; + return dev_list; } struct createinfo createinfo = { - .autof = 2, /* by default, create devices with standard names */ .names = 0, /* By default, stick with numbered md devices. 
*/ .bblist = 1, /* Use a bad block list by default */ #ifdef DEBIAN @@ -403,52 +406,6 @@ struct createinfo createinfo = { #endif }; -int parse_auto(char *str, char *msg, int config) -{ - int autof; - if (str == NULL || *str == 0) - autof = 2; - else if (strcasecmp(str, "no") == 0) - autof = 1; - else if (strcasecmp(str, "yes") == 0) - autof = 2; - else if (strcasecmp(str, "md") == 0) - autof = config ? 5:3; - else { - /* There might be digits, and maybe a hypen, at the end */ - char *e = str + strlen(str); - int num = 4; - int len; - while (e > str && isdigit(e[-1])) - e--; - if (*e) { - num = atoi(e); - if (num <= 0) - num = 1; - } - if (e > str && e[-1] == '-') - e--; - len = e - str; - if ((len == 2 && strncasecmp(str, "md", 2) == 0)) { - autof = config ? 5 : 3; - } else if ((len == 3 && strncasecmp(str, "yes", 3) == 0)) { - autof = 2; - } else if ((len == 3 && strncasecmp(str, "mdp", 3) == 0)) { - autof = config ? 6 : 4; - } else if ((len == 1 && strncasecmp(str, "p", 1) == 0) || - (len >= 4 && strncasecmp(str, "part", 4) == 0)) { - autof = 6; - } else { - pr_err("%s arg of \"%s\" unrecognised: use no,yes,md,mdp,part\n" - " optionally followed by a number.\n", - msg, str); - exit(2); - } - autof |= num << 3; - } - return autof; -} - static void createline(char *line) { char *w; @@ -456,7 +413,8 @@ static void createline(char *line) for (w = dl_next(line); w != line; w = dl_next(w)) { if (strncasecmp(w, "auto=", 5) == 0) - createinfo.autof = parse_auto(w + 5, "auto=", 1); + /* auto is no supported now, ignore it silently */ + continue; else if (strncasecmp(w, "owner=", 6) == 0) { if (w[6] == 0) { pr_err("missing owner name\n"); @@ -625,9 +583,9 @@ void arrayline(char *line) if (!mis.st) pr_err("metadata format %s unknown, ignored.\n", w + 9); - } else if (strncasecmp(w, "auto=", 5) == 0 ) { - /* whether to create device special files as needed */ - mis.autof = parse_auto(w + 5, "auto type", 0); + } else if (strncasecmp(w, "auto=", 5) == 0) { + /* Ignore for 
backward compatibility */ + continue; } else if (strncasecmp(w, "member=", 7) == 0) { /* subarray within a container */ mis.member = xstrdup(w + 7); @@ -26,6 +26,21 @@ void dl_free(void *v) free(vv-1); } +void dl_free_all(void *head) +{ + /* The list head is linked with the list tail so in order to free + * all the elements properly there is a need to keep starting point. + */ + void *d = dl_next(head), *next; + + while (d != head) { + next = dl_next(d); + dl_free(d); + d = next; + } + dl_free(head); +} + void dl_init(void *v) { dl_next(v) = v; @@ -16,10 +16,11 @@ struct __dl_head #define dl_prev(p) *(&(((struct __dl_head*)(p))[-1].dh_prev)) void *dl_head(void); -char *dl_strdup(char *); -char *dl_strndup(char *, int); -void dl_insert(void*, void*); -void dl_add(void*, void*); -void dl_del(void*); -void dl_free(void*); -void dl_init(void*); +char *dl_strdup(char *s); +char *dl_strndup(char *s, int l); +void dl_insert(void *head, void *val); +void dl_add(void *head, void *val); +void dl_del(void *val); +void dl_free(void *v); +void dl_init(void *v); +void dl_free_all(void *head); @@ -24,6 +24,8 @@ #include "mdadm.h" #include "dlink.h" +#include "xmalloc.h" + #include <ctype.h> #include <limits.h> diff --git a/managemon.c b/managemon.c index 358459e..d798132 100644 --- a/managemon.c +++ b/managemon.c @@ -104,6 +104,8 @@ #endif #include "mdadm.h" #include "mdmon.h" +#include "xmalloc.h" + #include <sys/syscall.h> #include <sys/socket.h> @@ -438,6 +440,39 @@ static int disk_init_and_add(struct mdinfo *disk, struct mdinfo *clone, return 0; } +/** + * managemon_disk_remove()- remove disk from the MD array. + * @disk: device to be removed. + * @array_devnm: the name of the array to remove disk from. + * + * It tries to remove the disk from the MD array and if it is successful then it closes all opened + * descriptors. Removing action requires suspend, it might take a while. + * Invalidating mdi->state_fd will prevent from using this device further (see duplicate_aa()). 
+ * + * To avoid deadlock, new file descriptor is opened because monitor may already wait on + * mdddev_suspend() in kernel and keep saved descriptor locked. + * + * Returns MDADM_STATUS_SUCCESS if disk has been removed, MDADM_STATUS_ERROR otherwise. + */ +static mdadm_status_t managemon_disk_remove(struct mdinfo *disk, char *array_devnm) +{ + int new_state_fd = sysfs_open2(array_devnm, disk->sys_name, "state"); + + if (!is_fd_valid(new_state_fd)) + return MDADM_STATUS_ERROR; + + if (write_attr("remove", new_state_fd) != MDADM_STATUS_SUCCESS) + return MDADM_STATUS_ERROR; + + close_fd(&new_state_fd); + close_fd(&disk->state_fd); + close_fd(&disk->recovery_fd); + close_fd(&disk->bb_fd); + close_fd(&disk->ubb_fd); + + return MDADM_STATUS_SUCCESS; +} + static void manage_member(struct mdstat_ent *mdstat, struct active_array *a) { @@ -512,15 +547,45 @@ static void manage_member(struct mdstat_ent *mdstat, if (a->container == NULL) return; - if (sigterm && a->info.safe_mode_delay != 1 && - a->safe_mode_delay_fd >= 0) { - long int new_delay = 1; - char delay[10]; - ssize_t len; + if (sigterm && a->info.safe_mode_delay != 1 && a->safe_mode_delay_fd >= 0) + if (write_attr("0.001", a->safe_mode_delay_fd) == MDADM_STATUS_SUCCESS) + a->info.safe_mode_delay = 1; + + if (a->check_member_remove) { + bool any_removed = false; + bool all_removed = true; + struct mdinfo *disk; + + for (disk = a->info.devs; disk; disk = disk->next) { + if (disk->man_disk_to_remove == false) + continue; + + if (disk->mon_descriptors_not_used == false) { + /* To early, repeat later */ + all_removed = false; + continue; + } - len = snprintf(delay, sizeof(delay), "0.%03ld\n", new_delay); - if (write(a->safe_mode_delay_fd, delay, len) == len) - a->info.safe_mode_delay = new_delay; + if (managemon_disk_remove(disk, a->info.sys_name)) { + all_removed = false; + continue; + } + + any_removed = true; + } + + if (any_removed) { + struct active_array *newa = duplicate_aa(a); + + if (all_removed) + 
newa->check_member_remove = false; + + replace_array(container, a, newa); + a = newa; + } + + if (!all_removed) + return; } /* We don't check the array while any update is pending, as it @@ -544,8 +609,6 @@ static void manage_member(struct mdstat_ent *mdstat, return; newa = duplicate_aa(a); - if (!newa) - goto out; /* prevent the kernel from activating the disk(s) before we * finish adding them */ @@ -575,7 +638,7 @@ static void manage_member(struct mdstat_ent *mdstat, "sync_action", "recover") == 0) newa->prev_action = recover; dprintf("recovery started on %s\n", a->info.sys_name); - out: + while (newdev) { d = newdev->next; free(newdev); @@ -609,11 +672,9 @@ static void manage_member(struct mdstat_ent *mdstat, if (d2) /* already have this one */ continue; - if (!newa) { + if (!newa) newa = duplicate_aa(a); - if (!newa) - break; - } + newd = xmalloc(sizeof(*newd)); disk_init_and_add(newd, d, newa); } @@ -776,10 +837,8 @@ static void manage_new(struct mdstat_ent *mdstat, error: pr_err("failed to monitor %s\n", mdstat->metadata_version); - if (new) { - new->container = NULL; - free_aa(new); - } + new->container = NULL; + free_aa(new); if (mdi) sysfs_free(mdi); } @@ -870,8 +929,15 @@ void read_sock(struct supertype *container) return; fl = fcntl(fd, F_GETFL, 0); + if (fl < 0) { + close_fd(&fd); + return; + } fl |= O_NONBLOCK; - fcntl(fd, F_SETFL, fl); + if (fcntl(fd, F_SETFL, fl) < 0) { + close_fd(&fd); + return; + } do { msg.buf = NULL; @@ -43,6 +43,8 @@ * at compile time via MAP_DIR and MAP_FILE. 
*/ #include "mdadm.h" +#include "xmalloc.h" + #include <sys/file.h> #include <ctype.h> @@ -339,18 +341,14 @@ struct map_ent *map_by_name(struct map_ent **map, char *name) */ static char *get_member_info(struct mdstat_ent *ent) { + char *subarray; - if (ent->metadata_version == NULL || - strncmp(ent->metadata_version, "external:", 9) != 0) + if (!is_mdstat_ent_subarray(ent)) return NULL; - if (is_subarray(&ent->metadata_version[9])) { - char *subarray; + subarray = strrchr(ent->metadata_version, '/'); - subarray = strrchr(ent->metadata_version, '/'); - return subarray + 1; - } - return NULL; + return subarray + 1; } void RebuildMap(void) @@ -48,11 +48,9 @@ stored in the device. This metadata is sometimes called a The metadata records information about the structure and state of the array. This allows the array to be reliably re-assembled after a shutdown. -From Linux kernel version 2.6.10, .B md provides support for two different formats of metadata, and -other formats can be added. Prior to this release, only one format is -supported. +other formats can be added. The common format \(em known as version 0.90 \(em has a superblock that is 4K long and is written into a 64K aligned block that @@ -126,7 +124,6 @@ have special-purpose uses and is supported. .SS ARRAYS WITH EXTERNAL METADATA -From release 2.6.28, the .I md driver supports arrays with externally managed metadata. That is, the metadata is not managed by the kernel but rather by a user-space @@ -178,8 +175,7 @@ A RAID0 array (which has zero redundancy) is also known as a striped array. A RAID0 array is configured at creation with a .B "Chunk Size" -which must be a power of two (prior to Linux 2.6.31), and at least 4 -kibibytes. +which must be at least 4 kibibytes. The RAID0 driver assigns the first chunk of the array to the first device, the second chunk to the second device, and so on until all @@ -224,7 +220,7 @@ option. 
If you use this option while running a newer kernel, the array will NOT assemble, but the metadata will be updated so that it can be assembled on an older kernel. -No that setting the layout to "unspecified" removes protections against +Note that setting the layout to "unspecified" removes protections against this bug, and you must be sure that the kernel you use matches the layout of the array. @@ -303,8 +299,7 @@ drives. When configuring a RAID10 array, it is necessary to specify the number of replicas of each data block that are required (this will usually -be\ 2) and whether their layout should be "near", "far" or "offset" -(with "offset" being available since Linux\ 2.6.18). +be\ 2) and whether their layout should be "near", "far" or "offset". .B About the RAID10 Layout Examples: .br @@ -707,9 +702,7 @@ The array can still be used, though possibly with reduced performance. If a RAID4, RAID5 or RAID6 array is degraded (missing at least one drive, two for RAID6) when it is restarted after an unclean shutdown, it cannot recalculate parity, and so it is possible that data might be -undetectably corrupted. The 2.4 md driver -.B does not -alert the operator to this condition. The 2.6 md driver will fail to +undetectably corrupted. The md driver will fail to start an array in this condition without manual intervention, though this behaviour can be overridden by a kernel parameter. @@ -724,12 +717,9 @@ either by copying a working drive in a RAID1 configuration, or by doing calculations with the parity block on RAID4, RAID5 or RAID6, or by finding and copying originals for RAID10. -In kernels prior to about 2.6.15, a read error would cause the same -effect as a write error. In later kernels, a read-error will instead -cause md to attempt a recovery by overwriting the bad block. i.e. it -will find the correct data from elsewhere, write it over the block -that failed, and then try to read it back again. 
If either the write -or the re-read fail, md will treat the error the same way that a write +A read-error will cause md to attempt a recovery by overwriting the bad block. i.e. it will find +the correct data from elsewhere, write it over the block that failed, and then try to read it back +again. If either the write or the re-read fail, md will treat the error the same way that a write error is treated, and will fail the whole device. While this recovery process is happening, the md driver will monitor @@ -851,7 +841,6 @@ especially when the device is used for swap. .SS BITMAP WRITE-INTENT LOGGING -From Linux 2.6.13, .I md supports a bitmap based write-intent log. If configured, the bitmap is used to record which blocks of the array may be out of sync. @@ -878,12 +867,11 @@ be stored near the superblocks of an array which has superblocks. It is possible to add an intent log to an active array, or remove an intent log if one is present. -In 2.6.13, intent bitmaps are only supported with RAID1. Other levels -with redundancy are supported from 2.6.15. +All raid levels with redundancy are supported. .SS BAD BLOCK LIST -From Linux 3.5 each device in an +Each device in an .I md array can store a list of known-bad-blocks. This list is 4K in size and usually positioned at the end of the space between the superblock @@ -937,18 +925,12 @@ Partial parity for a write operation is the XOR of stripe data chunks not modified by the write. PPL is stored in the metadata region of RAID member drives, no additional journal drive is needed. After crashes, if one of the not modified data disks of -the stripe is missing, this updated parity can be used to recover its -data. +the stripe is missing, this updated parity can be used to recover its data. -This mechanism is documented more fully in the file -Documentation/md/raid5-ppl.rst +See Documentation/driver-api/md/raid5-ppl.rst for implementation details. 
.SS WRITE-BEHIND -From Linux 2.6.14, -.I md -supports WRITE-BEHIND on RAID1 arrays. - This allows certain devices in the array to be flagged as .IR write-mostly . MD will only read from such devices if there is no @@ -1030,7 +1012,8 @@ array (so the stripes are wider), changing the chunk size (so stripes are deeper or shallower), or changing the arrangement of data and parity (possibly changing the RAID level, e.g. 1 to 5 or 5 to 6). -As of Linux 2.6.35, md can reshape a RAID4, RAID5, or RAID6 array to +.I md +can reshape a RAID4, RAID5, or RAID6 array to have a different number of devices (more or fewer) and to have a different layout or chunk size. It can also convert between these different RAID levels. It can also convert between RAID0 and RAID10, @@ -1069,7 +1052,7 @@ after a system crash. .PP .B mdadm -versions from 2.4 do this for growing a RAID5 array. +does this for growing a RAID5 array. For operations that do not change the size of the array, like simply increasing chunk size, or converting RAID5 to RAID6 with one extra @@ -1231,18 +1214,6 @@ an MD array, and if any full arrays are found, they are started. This kernel parameter disables this behaviour. .TP -.B raid=partitionable -.TP -.B raid=part -These are available in 2.6 and later kernels only. They indicate that -autodetected MD arrays should be created as partitionable arrays, with -a different major device number to the original non-partitionable md -arrays. The device number is listed as -.I mdp -in -.IR /proc/devices . - -.TP .B md_mod.start_ro=1 .TP .B /sys/module/md_mod/parameters/start_ro @@ -1273,14 +1244,6 @@ from the listed devices. It is only necessary to start the device holding the root filesystem this way. Other arrays are best started once the system is booted. -In 2.6 kernels, the -.B d -immediately after the -.B = -indicates that a partitionable device (e.g. -.BR /dev/md/d0 ) -should be created rather than the original non-partitionable device. - .TP .BI md= n , l , c , i , dev... 
This tells the md driver to assemble a legacy RAID0 or LINEAR array @@ -48,11 +48,12 @@ multiple devices: each device is a path to one common physical storage device. New installations should not use md/multipath as it is not well supported and has no ongoing development. Use the Device Mapper based -multipath-tools instead. +multipath-tools instead. It is deprecated and support will be removed in the future. .B FAULTY is also not true RAID, and it only involves one device. It -provides a layer over a true device that can be used to inject faults. +provides a layer over a true device that can be used to inject faults. It is deprecated +and support will be removed in the future. .B CONTAINER is different again. A @@ -354,7 +355,7 @@ preferred 1.x format). 'if '{DEFAULT_METADATA}'1.2' "default" is equivalent to "1.2". .IP ddf Use the "Industry Standard" DDF (Disk Data Format) format defined by -SNIA. +SNIA. DDF is deprecated and there is no active development around it. When creating a DDF array a .B CONTAINER will be created, and normal arrays can be created in that container. @@ -443,8 +444,7 @@ multipath, RAID0 and RAID1. It is never allowed for RAID4, RAID5 or RAID6. .br This number can only be changed using .B \-\-grow -for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide -the necessary support. +for RAID1, RAID4, RAID5 and RAID6 arrays. .TP .BR \-x ", " \-\-spare\-devices= @@ -563,8 +563,7 @@ component will be rounded down to a multiple of this size. This is a synonym for .B \-\-chunk but highlights the different meaning for Linear as compared to other -RAID levels. The default is 64K if a kernel earlier than 2.6.16 is in -use, and is 0K (i.e. no rounding) in later kernels. +RAID levels. The default is 0K (i.e. no rounding). .TP .BR \-l ", " \-\-level= @@ -829,7 +828,7 @@ facts the operator knows. 
When an array is resized to a larger size with .B "\-\-grow \-\-size=" the new space is normally resynced in that same way that the whole -array is resynced at creation. From Linux version 3.0, +array is resynced at creation. .B \-\-assume\-clean can be used with that command to avoid the automatic resync. @@ -837,8 +836,7 @@ can be used with that command to avoid the automatic resync. .BR \-\-write-zeroes When creating an array, send write zeroes requests to all the block devices. This should zero the data area on all disks such that the -initial sync is not necessary and, if successfull, will behave -as if +initial sync is not necessary and, if successful, will behave as if .B \-\-assume\-clean was specified. .IP @@ -875,7 +873,6 @@ Setting the offset explicitly over-rides the default. The value given is in Kilobytes unless a suffix of 'K', 'M', 'G' or 'T' is used to explicitly indicate Kilobytes, Megabytes, Gigabytes or Terabytes respectively. -Since Linux 3.4, .B \-\-data\-offset can also be used with .B --grow @@ -969,63 +966,6 @@ array, and no resync, recovery, or reshape will be started. It works with Create, Assemble, Manage and Misc mode. .TP -.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}" -Instruct mdadm how to create the device file if needed, possibly allocating -an unused minor number. "md" causes a non-partitionable array -to be used (though since Linux 2.6.28, these array devices are in fact -partitionable). "mdp", "part" or "p" causes a partitionable array (2.6 and -later) to be used. "yes" requires the named md device to have -a 'standard' format, and the type and minor number will be determined -from this. With mdadm 3.0, device creation is normally left up to -.I udev -so this option is unlikely to be needed. -See DEVICE NAMES below. - -The argument can also come immediately after -"\-a". e.g. "\-ap". - -If -.B \-\-auto -is not given on the command line or in the config file, then -the default will be -.BR \-\-auto=yes . 
- -If -.B \-\-scan -is also given, then any -.I auto= -entries in the config file will override the -.B \-\-auto -instruction given on the command line. - -For partitionable arrays, -.I mdadm -will create the device file for the whole array and for the first 4 -partitions. A different number of partitions can be specified at the -end of this option (e.g. -.BR \-\-auto=p7 ). -If the device name ends with a digit, the partition names add a 'p', -and a number, e.g. -.IR /dev/md/home1p3 . -If there is no trailing digit, then the partition names just have a -number added, e.g. -.IR /dev/md/scratch3 . - -If the md device name is in a 'standard' format as described in DEVICE -NAMES, then it will be created, if necessary, with the appropriate -device number based on that name. If the device name is not in one of these -formats, then an unused device number will be allocated. The device -number will be considered unused if there is no active array for that -number, and there is no entry in /dev for that number and with a -non-standard name. Names that are not in 'standard' format are only -allowed in "/dev/md/". - -This is meaningful with -.B \-\-create -or -.BR \-\-build . - -.TP .BR \-a ", " "\-\-add" This option can be used in Grow mode in two cases. @@ -1169,10 +1109,6 @@ and can be used if the physical connections to devices are not as reliable as you would like. .TP -.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}" -See this option under Create and Build options. - -.TP .BR \-b ", " \-\-bitmap= Specify the bitmap file that was given when the array was created. If an array has an @@ -1206,7 +1142,6 @@ backup file. .BR \-U ", " \-\-update= Update the superblock on each device while assembling the array. The argument given to this flag can be one of -.BR sparc2.2 , .BR summaries , .BR uuid , .BR name , @@ -1229,16 +1164,6 @@ or .BR super\-minor . 
The -.B sparc2.2 -option will adjust the superblock of an array what was created on a Sparc -machine running a patched 2.2 Linux kernel. This kernel got the -alignment of part of the superblock wrong. You can use the -.B "\-\-examine \-\-sparc2.2" -option to -.I mdadm -to see what effect this would have. - -The .B super\-minor option will update the .B "preferred minor" @@ -1251,7 +1176,7 @@ reports a different "Preferred Minor" to In some cases this update will be performed automatically by the kernel driver. In particular, the update happens automatically at the first write to an array with redundancy (RAID level 1 or -greater) on a 2.6 (or later) kernel. +greater). The .B uuid @@ -1686,18 +1611,6 @@ and applies to devices which are components of an array, while .B \-\-detail applies to a whole array which is currently active. -.TP -.B \-\-sparc2.2 -If an array was created on a SPARC machine with a 2.2 Linux kernel -patched with RAID support, the superblock will have been created -incorrectly, or at least incompatibly with 2.4 and later kernels. -Using the -.B \-\-sparc2.2 -flag with -.B \-\-examine -will fix the superblock before displaying it. If this appears to do -the right thing, then the array can be successfully assembled using -.BR "\-\-assemble \-\-update=sparc2.2" . .TP .BR \-X ", " \-\-examine\-bitmap @@ -1830,6 +1743,16 @@ can be found under .BR "SCRUBBING AND MISMATCHES" . +.TP +.B \-\-udev\-rules= +Generate the udev rules that handle hot-plugged bare devices and write them to the given file. +The rules are derived from the POLICYs defined under +.IR {CONFFILE}\ (or {CONFFILE2}) + +See +.BR mdadm.conf (5) +for more details and usage examples about POLICY. + .SH For Incremental Assembly mode: .TP .BR \-\-rebuild\-map ", " \-r @@ -1878,11 +1801,15 @@ script. .SH For Monitor mode: .TP .BR \-m ", " \-\-mail -Give a mail address to send alerts to. +Give a mail address to send alerts to. Can be configured in +.B mdadm.conf +as MAILADDR. 
.TP .BR \-p ", " \-\-program ", " \-\-alert -Give a program to be run whenever an event is detected. +Give a program to be run whenever an event is detected. Can be configured in +.B mdadm.conf +as PROGRAM. .TP .BR \-y ", " \-\-syslog @@ -1891,13 +1818,12 @@ facility of 'daemon' and varying priorities. .TP .BR \-d ", " \-\-delay -Give a delay in seconds. -.I mdadm -polls the md arrays and then waits this many seconds before polling -again. The default is 60 seconds. Since 2.6.16, there is no need to -reduce this as the kernel alerts +Give a delay in seconds. The default is 60 seconds. .I mdadm -immediately when there is any change. +polls the md arrays and then waits this many seconds before polling again if no event happened. +Can be configured in +.B mdadm.conf +as MONITORDELAY. .TP .BR \-r ", " \-\-increment @@ -2050,33 +1976,6 @@ detects that udev is not configured, it will create the devices in .B /dev itself. -In Linux kernels prior to version 2.6.28 there were two distinct -types of md devices that could be created: one that could be -partitioned using standard partitioning tools and one that could not. -Since 2.6.28 that distinction is no longer relevant as both types of -devices can be partitioned. -.I mdadm -will normally create the type that originally could not be partitioned -as it has a well-defined major number (9). - -Prior to 2.6.28, it is important that mdadm chooses the correct type -of array device to use. This can be controlled with the -.B \-\-auto -option. In particular, a value of "mdp" or "part" or "p" tells mdadm -to use a partitionable device rather than the default. - -In the no-udev case, the value given to -.B \-\-auto -can be suffixed by a number. This tells -.I mdadm -to create that number of partition devices rather than the default of 4. - -The value given to -.B \-\-auto -can also be given in the configuration file as a word starting -.B auto= -on the ARRAY line for the relevant array. 
- .SS Auto-Assembly When .B \-\-assemble @@ -2595,30 +2494,29 @@ is given, then a .B program or an .B e-mail -address must be specified on the -command line or in the config file. If neither are available, then +address must be specified on the command line or in the config file. If neither are available, then .I mdadm -will not monitor anything. -For devices given directly in command line, without +will not monitor anything. For devices given directly in command line, without .B program or .B email specified, each event is reported to .BR stdout. -Note: For systems where -.If mdadm monitor -is configured via systemd, -.B mdmonitor(mdmonitor.service) -should be configured. The service is designed to be primary solution for array monitoring, -it is configured to work in system wide mode. -It is automatically started and stopped according to current state and types of MD arrays in system. -The service may require additional configuration, like -.B e-mail -or -.B delay. -That should be done in -.B mdadm.conf. +Note: On systems where mdadm monitoring is managed through systemd, the mdmonitor.service +should be present. This service is designed to be the primary solution for array monitoring. +It is configured to operate in system-wide mode. It is initiated by udev when start criteria are +met, e.g. +.B mdadm.conf +exists and necessary configuration parameters are set. +It is kept alive as long as a redundant RAID array is active; it stops otherwise. User should +customize MAILADDR in +.B mdadm.conf +to receive mail notifications. MONITORDELAY, MAILFROM and PROGRAM are optional. See +.BR mdadm.conf (5) +for detailed description of these options. +Use systemctl status mdmonitor.service to verify status or determine if additional configuration +is needed. The different events are: @@ -2774,7 +2672,7 @@ and then follow similar steps as above if a matching spare is found. The GROW mode is used for changing the size or shape of an active array. 
-During the kernel 2.6 era the following changes were added: +The following changes are supported: .IP \(bu 4 change the "size" attribute for RAID1, RAID4, RAID5 and RAID6. .IP \(bu 4 @@ -2858,14 +2756,12 @@ When the number of devices is increased, any hot spares that are present will be activated immediately. Changing the number of active devices in a RAID5 or RAID6 is much more -effort. Every block in the array will need to be read and written -back to a new location. From 2.6.17, the Linux Kernel is able to -increase the number of devices in a RAID5 safely, including restarting -an interrupted "reshape". From 2.6.31, the Linux Kernel is able to -increase or decrease the number of devices in a RAID5 or RAID6. +effort. Every block in the array will need to be read and written +back to a new location. Linux Kernel is able to increase or decrease +the number of devices in a RAID5 and RAID6 safely, including restarting +an interrupted "reshape". -From 2.6.35, the Linux Kernel is able to convert a RAID0 in to a RAID4 -or RAID5. +The Linux Kernel is able to convert a RAID0 into a RAID4 or RAID5. .I mdadm uses this functionality and the ability to add devices to a RAID4 to allow devices to be added to a RAID0. When @@ -3419,26 +3315,14 @@ Also if the homehost is specified as will only use a suffix if a different array of the same name already exists or is listed in the config file. -The standard names for non-partitioned arrays (the only sort of md -array available in 2.4 and earlier) are of the form +The names for arrays are of the form: .IP .RB /dev/md NN .PP where NN is a number. -The standard names for partitionable arrays (as available from 2.6 -onwards) are of the form: -.IP -.RB /dev/md_d NN -.PP -Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2". -.PP -From kernel version 2.6.28 the "non-partitioned array" can actually -be partitioned. 
So the "md_d\fBNN\fP" -names are no longer needed, and -partitions such as "/dev/md\fBNN\fPp\fBXX\fP" -are possible. + .PP -From kernel version 2.6.29 standard names can be non-numeric following +Names can be non-numeric following the form: .IP .RB /dev/md_ XXX @@ -27,6 +27,8 @@ #include "mdadm.h" #include "md_p.h" +#include "xmalloc.h" + #include <ctype.h> /** @@ -104,16 +106,6 @@ int main(int argc, char *argv[]) mdu_array_info_t array; int devs_found = 0; int grow_continue = 0; - /* autof indicates whether and how to create device node. - * bottom 3 bits are style. Rest (when shifted) are number of parts - * 0 - unset - * 1 - don't create (no) - * 2 - if is_standard, then create (yes) - * 3 - create as 'md' - reject is_standard mdp (md) - * 4 - create as 'mdp' - reject is_standard md (mdp) - * 5 - default to md if not is_standard (md in config file) - * 6 - default to mdp if not is_standard (part, or mdp in config file) - */ struct context c = { .require_homehost = 1, }; @@ -137,7 +129,7 @@ int main(int argc, char *argv[]) struct supertype *ss = NULL; enum flag_mode writemostly = FlagDefault; enum flag_mode failfast = FlagDefault; - char *shortopt = short_options; + char *shortopt = short_opts; int dosyslog = 0; int rebuild_map = 0; char *remove_path = NULL; @@ -226,10 +218,10 @@ int main(int argc, char *argv[]) * set the mode if it isn't already */ - switch(opt) { + switch (opt) { case ManageOpt: newmode = MANAGE; - shortopt = short_bitmap_options; + shortopt = short_bitmap_opts; break; case 'a': case Add: @@ -245,27 +237,33 @@ int main(int argc, char *argv[]) case ClusterConfirm: if (!mode) { newmode = MANAGE; - shortopt = short_bitmap_options; + shortopt = short_bitmap_opts; } break; - case 'A': newmode = ASSEMBLE; - shortopt = short_bitmap_auto_options; + case 'A': + newmode = ASSEMBLE; + shortopt = short_bitmap_auto_opts; break; - case 'B': newmode = BUILD; - shortopt = short_bitmap_auto_options; + case 'B': + newmode = BUILD; + shortopt = 
short_bitmap_auto_opts; break; - case 'C': newmode = CREATE; - shortopt = short_bitmap_auto_options; + case 'C': + newmode = CREATE; + shortopt = short_bitmap_auto_opts; break; - case 'F': newmode = MONITOR; - shortopt = short_monitor_options; + case 'F': + newmode = MONITOR; + shortopt = short_monitor_opts; break; - case 'G': newmode = GROW; - shortopt = short_bitmap_options; + case 'G': + newmode = GROW; + shortopt = short_bitmap_opts; break; - case 'I': newmode = INCREMENTAL; - shortopt = short_bitmap_auto_options; + case 'I': + newmode = INCREMENTAL; + shortopt = short_bitmap_auto_opts; break; case AutoDetect: newmode = AUTODETECT; @@ -696,8 +694,8 @@ int main(int argc, char *argv[]) case O(INCREMENTAL,'a'): case O(INCREMENTAL,Auto): case O(ASSEMBLE,'a'): - case O(ASSEMBLE,Auto): /* auto-creation of device node */ - c.autof = parse_auto(optarg, "--auto flag", 0); + case O(ASSEMBLE, Auto): /* auto-creation of device node - deprecated */ + pr_info("--auto is deprecated and will be removed in future releases.\n"); continue; case O(BUILD,'f'): /* force honouring '-n 1' */ case O(BUILD,Force): /* force honouring '-n 1' */ @@ -1310,10 +1308,6 @@ int main(int argc, char *argv[]) if (ident_set_devname(&ident, devlist->devname) != MDADM_STATUS_SUCCESS) exit(1); - if ((int)ident.super_minor == -2 && c.autof) { - pr_err("--super-minor=dev is incompatible with --auto\n"); - exit(2); - } if (mode == MANAGE || mode == GROW) { mdfd = open_mddev(ident.devname, 1); if (mdfd < 0) @@ -1395,8 +1389,6 @@ int main(int argc, char *argv[]) exit(1); } - ident.autof = c.autof; - if (c.scan && c.verbose < 2) /* --scan implied --brief unless -vv */ c.brief = 1; @@ -1448,8 +1440,6 @@ int main(int argc, char *argv[]) if (mdfd >= 0) close(mdfd); } else { - if (array_ident->autof == 0) - array_ident->autof = c.autof; rv |= Assemble(ss, ident.devname, array_ident, NULL, &c); } } else if (!c.scan) @@ -1471,10 +1461,7 @@ int main(int argc, char *argv[]) rv |= 1; continue; } - if 
(array_ident->autof == 0) - array_ident->autof = c.autof; - rv |= Assemble(ss, dv->devname, array_ident, - NULL, &c); + rv |= Assemble(ss, dv->devname, array_ident, NULL, &c); } } else { if (c.update) { @@ -1703,8 +1690,7 @@ int main(int argc, char *argv[]) rv = 1; break; } - rv = IncrementalRemove(devlist->devname, remove_path, - c.verbose); + rv = Incremental_remove(devlist->devname, remove_path, c.verbose); } else rv = Incremental(devlist, &c, ss); break; @@ -1741,11 +1727,10 @@ static int scan_assemble(struct supertype *ss, pr_err("No devices listed in conf file were found.\n"); return 1; } - for (a = array_list; a; a = a->next) { + + for (a = array_list; a; a = a->next) a->assembled = 0; - if (a->autof == 0) - a->autof = c->autof; - } + if (map_lock(&map)) pr_err("failed to get exclusive lock on mapfile\n"); do { @@ -1777,7 +1762,7 @@ static int scan_assemble(struct supertype *ss, */ int rv2; int acnt; - ident->autof = c->autof; + do { struct mddev_dev *devlist = conf_get_devs(); acnt = 0; diff --git a/mdadm.conf.5.in b/mdadm.conf.5.in index 14302a9..44aff74 100644 --- a/mdadm.conf.5.in +++ b/mdadm.conf.5.in @@ -185,27 +185,6 @@ a group to another array in that group if the first array had a failed or missing drive but no spare. .TP -.B auto= -This option is rarely needed with mdadm-3.0, particularly if use with -the Linux kernel v2.6.28 or later. -It tells -.I mdadm -whether to use partitionable array or non-partitionable arrays and, -in the absence of -.IR udev , -how many partition devices to create. From 2.6.28 all md array -devices are partitionable, hence this option is not needed. - -The value of this option can be "yes" or "md" to indicate that a -traditional, non-partitionable md array should be created, or "mdp", -"part" or "partition" to indicate that a partitionable md array (only -available in linux 2.6 and later) should be used. This later set can -also have a number appended to indicate how many partitions to create -device files for, e.g. 
-.BR auto=mdp5 . -The default is 4. - -.TP .B bitmap= The option specifies a file in which a write-intent bitmap should be found. When assembling the array, @@ -314,17 +293,6 @@ defaults (root/wheel or root/disk). .B mode= An octal file mode such as 0660 can be given to override the default of 0600. -.TP -.B auto= -This corresponds to the -.B \-\-auto -flag to mdadm. Give -.BR yes , -.BR md , -.BR mdp , -.B part -\(em possibly followed by a number of partitions \(em to indicate how -missing device entries should be created. .TP .B metadata= @@ -333,7 +301,7 @@ This can be useful to impose a system-wide default of version-1 superblocks. .TP .B names=yes -Since Linux 2.6.29 it has been possible to create +It has been possible to create .B md devices with a name like .B md_home @@ -506,8 +474,12 @@ of the new disk or if both arrays have the same .IR spare-group . To update hot plug configuration it is necessary to execute -.B mdadm \-\-udev\-rules -command after changing the config file +.B mdadm \-\-udev\-rules\=<path_to_file> +e.g. +.B /etc/udev/rules.d/65-md-bare.rules +command after changing the config file. And also run +.B udevadm control \-\-reload +otherwise, a reboot is needed. Keywords used in the .I POLICY @@ -724,14 +696,6 @@ ARRAY /dev/md/home UUID=9187a482:5dde19d9:eea3cc4a:d646ab8b .br auto=part .br -POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-* -.br - action=spare -.br -POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]* -.br - action=include -.br # One domain comprising of devices attached to specified paths is defined. .br # Bare device matching first path will be made an imsm spare on hot plug. @@ -742,6 +706,14 @@ POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]* .br # given domain name can be migrated. 
.br +POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-* +.br + action=spare +.br +POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]* +.br + action=include +.br MAILADDR root@mydomain.tld .br PROGRAM /usr/sbin/handle\-mdadm\-events @@ -100,6 +100,11 @@ struct dlm_lksb { #define DEFAULT_BITMAP_DELAY 5 #define DEFAULT_MAX_WRITE_BEHIND 256 +#ifndef DEV_DIR +#define DEV_DIR "/dev/" +#define DEV_DIR_LEN (sizeof(DEV_DIR) - 1) +#endif /* DEV_DIR */ + /* DEV_NUM_PREF is a subpath to numbered MD devices, e.g. /dev/md1 or directory name. * DEV_NUM_PREF_LEN is a length with Null byte excluded. */ @@ -161,6 +166,7 @@ struct dlm_lksb { #include "md_p.h" #include "bitmap.h" #include "msg.h" +#include "mdadm_status.h" #include <endian.h> /* Redhat don't like to #include <asm/byteorder.h>, and @@ -408,8 +414,15 @@ struct mdinfo { #define DS_BLOCKED 16 #define DS_REMOVE 1024 #define DS_UNBLOCK 2048 + #define DS_EXTERNAL_BB 4096 int prev_state, curr_state, next_state; + /* If set by monitor, managemon needs to remove faulty device */ + bool man_disk_to_remove : 1; + + /* Managemon cannot close descriptors if monitor is using them for select() */ + bool mon_descriptors_not_used : 1; + /* info read from sysfs */ enum { ARRAY_CLEAR, @@ -444,12 +457,6 @@ struct spare_criteria { struct dev_policy *pols; }; -typedef enum mdadm_status { - MDADM_STATUS_SUCCESS = 0, - MDADM_STATUS_ERROR, - MDADM_STATUS_UNDEF, -} mdadm_status_t; - enum mode { ASSEMBLE=1, BUILD, @@ -463,10 +470,8 @@ enum mode { mode_count }; -extern char short_options[]; -extern char short_monitor_options[]; -extern char short_bitmap_options[]; -extern char short_bitmap_auto_options[]; +extern char short_opts[], short_monitor_opts[], short_bitmap_opts[], short_bitmap_auto_opts[]; + extern struct option long_options[]; extern char Version[], Usage[], Help[], OptionHelp[], *mode_help[], @@ -634,7 +639,6 @@ struct mddev_ident { int raid_disks; int spare_disks; struct supertype *st; - int autof; /* 1 
for normal, 2 for partitioned */ char *spare_group; char *bitmap_file; int bitmap_fd; @@ -669,7 +673,6 @@ struct context { enum update_opt update; int scan; int SparcAdjust; - int autof; int delay; int freeze_reshape; char *backup_file; @@ -743,8 +746,12 @@ extern int mdstat_wait(int seconds); extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); extern int mddev_busy(char *devnm); extern struct mdstat_ent *mdstat_by_component(char *name); +extern struct mdstat_ent *mdstat_find_by_member_name(struct mdstat_ent *mdstat, char *member_devnm); extern struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container); +extern bool is_mdstat_ent_external(struct mdstat_ent *ent); +extern bool is_mdstat_ent_subarray(struct mdstat_ent *ent); + struct map_ent { struct map_ent *next; char devnm[32]; @@ -794,15 +801,17 @@ enum sysfs_read_flags { #define SYSFS_MAX_BUF_SIZE 64 +extern mdadm_status_t sysfs_write_descriptor(const int fd, const char *value, + const ssize_t len, int *errno_p); +extern mdadm_status_t write_attr(const char *value, const int fd); extern void sysfs_get_container_devnm(struct mdinfo *mdi, char *buf); -/* If fd >= 0, get the array it is open on, - * else use devnm. 
- */ extern int sysfs_open(char *devnm, char *devname, char *attr); +extern int sysfs_open_memb_attr(char *array_devnm, char *memb_devnm, char *attr, int oflag); extern int sysfs_init(struct mdinfo *mdi, int fd, char *devnm); extern void sysfs_init_dev(struct mdinfo *mdi, dev_t devid); extern void sysfs_free(struct mdinfo *sra); + extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options); extern int sysfs_attr_match(const char *attr, const char *str); extern int sysfs_match_word(const char *word, char **list); @@ -1597,7 +1606,7 @@ extern int Incremental(struct mddev_dev *devlist, struct context *c, struct supertype *st); extern void RebuildMap(void); extern int IncrementalScan(struct context *c, char *devnm); -extern int IncrementalRemove(char *devname, char *path, int verbose); +extern int Incremental_remove(char *devname, char *path, int verbose); extern int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, unsigned long write_behind, @@ -1749,8 +1758,6 @@ extern char *human_size(long long bytes); extern char *human_size_brief(long long bytes, int prefix); extern void print_r10_layout(int layout); -extern char *find_free_devnm(int use_partitions); - extern void put_md_name(char *name); extern char *devid2kname(dev_t devid); extern char *devid2devnm(dev_t devid); @@ -1759,8 +1766,7 @@ extern char *get_md_name(char *devnm); extern char DefaultConfFile[]; -extern int create_mddev(char *dev, char *name, int autof, int trustworthy, - char *chosen, int block_udev); +extern int create_mddev(char *dev, char *name, int trustworthy, char *chosen, int block_udev); /* values for 'trustworthy' */ #define LOCAL 1 #define LOCAL_ANY 10 @@ -1771,7 +1777,7 @@ extern int is_mddev(char *dev); extern int open_container(int fd); extern int metadata_container_matches(char *metadata, char *devnm); extern int metadata_subdev_matches(char *metadata, char *devnm); -extern int is_container_member(struct mdstat_ent 
*ent, char *devname); +extern bool is_container_member(struct mdstat_ent *ent, char *devname); extern int is_subarray_active(char *subarray, char *devname); extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet); extern struct superswitch *version_to_superswitch(char *vers); @@ -1925,11 +1931,6 @@ static inline int xasprintf(char **strp, const char *fmt, ...) { #define pr_vrb(fmt, arg...) ((void)(verbose && pr_err(fmt, ##arg))) -void *xmalloc(size_t len); -void *xrealloc(void *ptr, size_t len); -void *xcalloc(size_t num, size_t size); -char *xstrdup(const char *str); - #define LEVEL_MULTIPATH (-4) #define LEVEL_LINEAR (-1) #define LEVEL_FAULTY (-5) @@ -2021,6 +2022,9 @@ enum r0layout { #define PATH_MAX 4096 #endif +/* The max string length necessary for decimal conversion, cannot be longer than count of bits */ +#define INT_2_DEC_STR_MAX (sizeof(int) * 8) + #define RESYNC_NONE -1 #define RESYNC_DELAYED -2 #define RESYNC_PENDING -3 diff --git a/mdadm_status.h b/mdadm_status.h new file mode 100644 index 0000000..905105e --- /dev/null +++ b/mdadm_status.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef MDADM_STATUS_H +#define MDADM_STATUS_H + +typedef enum mdadm_status { + MDADM_STATUS_SUCCESS = 0, + MDADM_STATUS_ERROR, + MDADM_STATUS_UNDEF, + MDADM_STATUS_MEM_FAIL +} mdadm_status_t; + +#endif @@ -65,6 +65,7 @@ #include "mdadm.h" #include "mdmon.h" +#include "xmalloc.h" char const Name[] = "mdmon"; @@ -198,8 +199,12 @@ static void try_kill_monitor(pid_t pid, char *devname, int sock) /* Wait for monitor to exit by reading from the socket, after * clearing the non-blocking flag */ fl = fcntl(sock, F_GETFL, 0); + if (fl < 0) + return; + fl &= ~O_NONBLOCK; - fcntl(sock, F_SETFL, fl); + if (fcntl(sock, F_SETFL, fl) < 0) + return; n = read(sock, buf, 100); /* If there is I/O going on it might took some time to get to @@ -249,7 +254,10 @@ static int make_control_sock(char *devname) listen(sfd, 10); fl = fcntl(sfd, F_GETFL, 
0); fl |= O_NONBLOCK; - fcntl(sfd, F_SETFL, fl); + if (fcntl(sfd, F_SETFL, fl) < 0) { + close_fd(&sfd); + return -1; + } return sfd; } @@ -394,9 +402,7 @@ int main(int argc, char *argv[]) /* launch an mdmon instance for each container found */ mdstat = mdstat_read(0, 0); for (e = mdstat; e; e = e->next) { - if (e->metadata_version && - strncmp(e->metadata_version, "external:", 9) == 0 && - !is_subarray(&e->metadata_version[9])) { + if (is_mdstat_ent_external(e) && !is_mdstat_ent_subarray(e)) { /* update cmdline so this mdmon instance can be * distinguished from others in a call to ps(1) */ @@ -451,22 +457,25 @@ static int mdmon(char *devnm, int must_fork, int takeover) if (must_fork) { if (pipe(pfd) != 0) { pr_err("failed to create pipe\n"); + close_fd(&mdfd); return 1; } switch(fork()) { case -1: pr_err("failed to fork: %s\n", strerror(errno)); + close_fd(&mdfd); return 1; case 0: /* child */ - close(pfd[0]); + close_fd(&pfd[0]); break; default: /* parent */ - close(pfd[1]); + close_fd(&pfd[1]); if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) { wait(&status); status = WEXITSTATUS(status); } - close(pfd[0]); + close_fd(&pfd[0]); + close_fd(&mdfd); return status; } } else @@ -48,8 +48,14 @@ struct active_array { enum array_state prev_state, curr_state, next_state; enum sync_action prev_action, curr_action, next_action; - int check_degraded; /* flag set by mon, read by manage */ - int check_reshape; /* flag set by mon, read by manage */ + bool check_degraded : 1; /* flag set by mon, read by manage */ + bool check_reshape : 1; /* flag set by mon, read by manage */ + + /** + * Signalize managemon there is a mdi to be removed. + * Monitor must acknowledge faulty state first. 
+ */ + bool check_member_remove : 1; }; /* @@ -78,7 +84,7 @@ void do_manager(struct supertype *container); extern int sigterm; int read_dev_state(int fd); -int is_container_member(struct mdstat_ent *mdstat, char *container); +bool is_container_member(struct mdstat_ent *mdstat, char *container); struct mdstat_ent *mdstat_read(int hold, int start); @@ -23,9 +23,11 @@ */ #include "mdadm.h" -#include "udev.h" #include "md_p.h" #include "md_u.h" +#include "udev.h" +#include "xmalloc.h" + #include <sys/wait.h> #include <limits.h> #include <syslog.h> @@ -225,6 +227,11 @@ int Monitor(struct mddev_dev *devlist, return 1; } + if (mkdir(MDMON_DIR, 0700) < 0 && errno != EEXIST) { + pr_err("Failed to create directory " MDMON_DIR ": %s\n", strerror(errno)); + return 1; + } + if (share){ if (check_one_sharer(c->scan) == 2) return 1; @@ -432,12 +439,12 @@ static int make_daemon(char *pidfile) } /* - * check_one_sharer() - Checks for other mdmon processes running. + * check_one_sharer() - Checks for other mdmonitor processes running. 
* * Return: * 0 - no other processes running, * 1 - warning, - * 2 - error, or when scan mode is enabled, and one mdmon process already exists + * 2 - error, or when scan mode is enabled, and one mdmonitor process already exists */ static int check_one_sharer(int scan) { @@ -513,11 +520,6 @@ static int write_autorebuild_pid(void) FILE *fp; int fd; - if (mkdir(MDMON_DIR, 0700) < 0 && errno != EEXIST) { - pr_err("%s: %s\n", strerror(errno), MDMON_DIR); - return 1; - } - if (!is_directory(MDMON_DIR)) { pr_err("%s is not a regular directory.\n", MDMON_DIR); return 1; @@ -879,9 +881,7 @@ static int check_array(struct state *st, struct mdstat_ent *mdstat, } last_disk = i; - if (mse->metadata_version && - strncmp(mse->metadata_version, "external:", 9) == 0 && - is_subarray(mse->metadata_version+9)) { + if (is_mdstat_ent_subarray(mse)) { char *sl; snprintf(st->parent_devnm, MD_NAME_MAX, "%s", mse->metadata_version + 10); sl = strchr(st->parent_devnm, '/'); @@ -991,13 +991,12 @@ static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist) snprintf(st->devnm, MD_NAME_MAX, "%s", mse->devnm); st->percent = RESYNC_UNKNOWN; st->expected_spares = -1; - if (mse->metadata_version && - strncmp(mse->metadata_version, - "external:", 9) == 0 && - is_subarray(mse->metadata_version+9)) { + + if (is_mdstat_ent_subarray(mse)) { char *sl; - snprintf(st->parent_devnm, MD_NAME_MAX, - "%s", mse->metadata_version + 10); + + snprintf(st->parent_devnm, MD_NAME_MAX, "%s", + mse->metadata_version + 10); sl = strchr(st->parent_devnm, '/'); if (sl) *sl = 0; @@ -1297,8 +1296,7 @@ int Wait(char *dev) } } if (!e || e->percent == RESYNC_NONE) { - if (e && e->metadata_version && - strncmp(e->metadata_version, "external:", 9) == 0) { + if (e && is_mdstat_ent_external(e)) { if (is_subarray(&e->metadata_version[9])) ping_monitor(&e->metadata_version[9]); else @@ -25,83 +25,9 @@ #include "mdadm.h" #include "udev.h" #include "md_p.h" -#include <ctype.h> - -void make_parts(char *dev, int cnt) 
-{ - /* make 'cnt' partition devices for 'dev' - * If dev is a device name we use the - * major/minor from dev and add 1..cnt - * If it is a symlink, we make similar symlinks. - * If dev ends with a digit, we add "p%d" else "%d" - * If the name exists, we use it's owner/mode, - * else that of dev - */ - struct stat stb; - int major_num; - int minor_num; - int odig; - int i; - int nlen = strlen(dev) + 20; - char *name; - int dig = isdigit(dev[strlen(dev)-1]); - char orig[1001]; - char sym[1024]; - int err; - - if (cnt == 0) - cnt = 4; - if (lstat(dev, &stb)!= 0) - return; - - if (S_ISBLK(stb.st_mode)) { - major_num = major(stb.st_rdev); - minor_num = minor(stb.st_rdev); - odig = -1; - } else if (S_ISLNK(stb.st_mode)) { - int len; - - len = readlink(dev, orig, sizeof(orig)); - if (len < 0 || len >= (int)sizeof(orig)) - return; - orig[len] = 0; - odig = isdigit(orig[len-1]); - major_num = -1; - minor_num = -1; - } else - return; - name = xmalloc(nlen); - for (i = 1; i <= cnt ; i++) { - struct stat stb2; - snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); - if (stat(name, &stb2) == 0) { - if (!S_ISBLK(stb2.st_mode) || !S_ISBLK(stb.st_mode)) - continue; - if (stb2.st_rdev == makedev(major_num, minor_num+i)) - continue; - unlink(name); - } else { - stb2 = stb; - } - if (S_ISBLK(stb.st_mode)) { - if (mknod(name, S_IFBLK | 0600, - makedev(major_num, minor_num+i))) - perror("mknod"); - if (chown(name, stb2.st_uid, stb2.st_gid)) - perror("chown"); - if (chmod(name, stb2.st_mode & 07777)) - perror("chmod"); - err = 0; - } else { - snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i); - err = symlink(sym, name); - } +#include "xmalloc.h" - if (err == 0 && stat(name, &stb2) == 0) - add_dev(name, &stb2, 0, NULL); - } - free(name); -} +#include <ctype.h> int create_named_array(char *devnm) { @@ -129,6 +55,35 @@ int create_named_array(char *devnm) return 1; } +char *find_free_devnm(void) +{ + static char devnm[MD_NAME_MAX]; + int devnum; + + for (devnum = 127; devnum != 
128; devnum = devnum ? devnum - 1 : 511) { + sprintf(devnm, "md%d", devnum); + + if (mddev_busy(devnm)) + continue; + + if (!conf_name_is_free(devnm)) + continue; + + if (!udev_is_available()) { + /* make sure it is new to /dev too*/ + dev_t devid = devnm2devid(devnm); + + if (devid && map_dev(major(devid), minor(devid), 0)) + continue; + } + + break; + } + if (devnum == 128) + return NULL; + return devnm; +} + /* * We need a new md device to assemble/build/create an array. * 'dev' is a name given us by the user (command line or mdadm.conf) @@ -163,15 +118,13 @@ int create_named_array(char *devnm) * When we create devices, we use uid/gid/umask from config file. */ -int create_mddev(char *dev, char *name, int autof, int trustworthy, +int create_mddev(char *dev, char *name, int trustworthy, char *chosen, int block_udev) { int mdfd; struct stat stb; int num = -1; - int use_mdp = -1; struct createinfo *ci = conf_get_create_info(); - int parts; char *cname; char devname[37]; char devnm[32]; @@ -183,12 +136,6 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, if (chosen == NULL) chosen = cbuf; - if (autof == 0) - autof = ci->autof; - - parts = autof >> 3; - autof &= 7; - strcpy(chosen, DEV_MD_DIR); cname = chosen + strlen(chosen); @@ -210,12 +157,9 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, dev, dev+5); return -1; } - if (strcmp(cname, "md") == 0) - use_mdp = 0; - else - use_mdp = 1; + /* recreate name: /dev/md/0 or /dev/md/d0 */ - sprintf(cname, "%s%d", use_mdp?"d":"", num); + sprintf(cname, "%d", num); } else strcpy(cname, dev); @@ -244,29 +188,13 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, ep = sp; if (ep == sp || *ep || num < 0) num = -1; - else if (cname[0] == 'd') - use_mdp = 1; - else - use_mdp = 0; } } /* Now determine device number */ - /* named 'METADATA' cannot use 'mdp'. 
*/ if (name && name[0] == 0) name = NULL; - if (name && trustworthy == METADATA && use_mdp == 1) { - pr_err("%s is not allowed for a %s container. Consider /dev/md%d.\n", dev, name, num); - return -1; - } - if (name && trustworthy == METADATA) - use_mdp = 0; - if (use_mdp == -1) { - if (autof == 4 || autof == 6) - use_mdp = 1; - else - use_mdp = 0; - } + if (num < 0 && trustworthy == LOCAL && name) { /* if name is numeric, possibly prefixed by * 'md' or '/dev/md', use that for num @@ -283,7 +211,7 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, if (ep == n2 || *ep) num = -1; else { - sprintf(devnm, "md%s%d", use_mdp ? "_d":"", num); + sprintf(devnm, "md%d", num); if (mddev_busy(devnm)) num = -1; } @@ -355,14 +283,15 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, if (devnm[0] == 0) { if (num < 0) { /* need to choose a free number. */ - char *_devnm = find_free_devnm(use_mdp); - if (_devnm == NULL) { + char *_devnm = find_free_devnm(); + + if (!_devnm) { pr_err("No avail md devices - aborting\n"); return -1; } strcpy(devnm, _devnm); } else { - sprintf(devnm, "%s%d", use_mdp?"md_d":"md", num); + sprintf(devnm, "md%d", num); if (mddev_busy(devnm)) { pr_err("%s is already in use.\n", dev); @@ -376,7 +305,7 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, sprintf(devname, "/dev/%s", devnm); - if (dev && dev[0] == '/') + if (dev && dev[0] == '/' && strlen(dev) < 400) strcpy(chosen, dev); else if (cname[0] == 0) strcpy(chosen, devname); @@ -406,11 +335,13 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, perror("chown"); if (chmod(devname, ci->mode)) perror("chmod"); - stat(devname, &stb); + if (stat(devname, &stb) < 0) { + pr_err("failed to stat %s\n", + devname); + return -1; + } add_dev(devname, &stb, 0, NULL); } - if (use_mdp == 1) - make_parts(devname, parts); if (strcmp(chosen, devname) != 0) { if (mkdir(DEV_NUM_PREF, 0700) == 0) { @@ -440,8 +371,6 @@ int create_mddev(char *dev, 
char *name, int autof, int trustworthy, } else if (symlink(devname, chosen) != 0) pr_err("failed to create %s: %s\n", chosen, strerror(errno)); - if (use_mdp && strcmp(chosen, devname) != 0) - make_parts(chosen, parts); } } mdfd = open_dev_excl(devnm); @@ -493,36 +422,3 @@ int is_mddev(char *dev) return 0; } - -char *find_free_devnm(int use_partitions) -{ - static char devnm[32]; - int devnum; - for (devnum = 127; devnum != 128; - devnum = devnum ? devnum-1 : (1<<9)-1) { - - if (use_partitions) - sprintf(devnm, "md_d%d", devnum); - else - sprintf(devnm, "md%d", devnum); - if (mddev_busy(devnm)) - continue; - if (!conf_name_is_free(devnm)) - continue; - if (!udev_is_available()) { - /* make sure it is new to /dev too, at least as a - * non-standard */ - dev_t devid = devnm2devid(devnm); - if (devid) { - char *dn = map_dev(major(devid), - minor(devid), 0); - if (dn && ! is_standard(dn, NULL)) - continue; - } - } - break; - } - if (devnum == 128) - return NULL; - return devnm; -} @@ -80,6 +80,8 @@ #include "mdadm.h" #include "dlink.h" +#include "xmalloc.h" + #include <sys/select.h> #include <ctype.h> @@ -110,6 +112,30 @@ static int add_member_devname(struct dev_member **m, char *name) return 1; } +/* Detach element from the list, it may modify list_head */ +static void mdstat_ent_list_detach_element(struct mdstat_ent **list_head, struct mdstat_ent *el) +{ + struct mdstat_ent *ent = *list_head; + + if (ent == el) { + *list_head = ent->next; + } else { + while (ent) { + if (ent->next == el) { + ent->next = el->next; + break; + } + + ent = ent->next; + } + + } + + /* Guard if not found or list is empty - not allowed */ + assert(ent); + el->next = NULL; +} + void free_mdstat(struct mdstat_ent *ms) { while (ms) { @@ -124,6 +150,32 @@ void free_mdstat(struct mdstat_ent *ms) } } +bool is_mdstat_ent_external(struct mdstat_ent *ent) +{ + if (!ent->metadata_version) + return false; + + if (strncmp(ent->metadata_version, "external:", 9) == 0) + return true; + return false; +} + 
+bool is_mdstat_ent_subarray(struct mdstat_ent *ent) +{ + if (is_mdstat_ent_external(ent) && is_subarray(ent->metadata_version + 9)) + return true; + return false; +} + +bool is_container_member(struct mdstat_ent *mdstat, char *container) +{ + if (is_mdstat_ent_external(mdstat) && + metadata_container_matches(mdstat->metadata_version + 9, container)) + return true; + + return false; +} + static int mdstat_fd = -1; struct mdstat_ent *mdstat_read(int hold, int start) { @@ -146,8 +198,11 @@ struct mdstat_ent *mdstat_read(int hold, int start) f = fopen("/proc/mdstat", "r"); if (f == NULL) return NULL; - else - fcntl(fileno(f), F_SETFD, FD_CLOEXEC); + + if (fcntl(fileno(f), F_SETFD, FD_CLOEXEC) < 0) { + fclose(f); + return NULL; + } all = NULL; end = &all; @@ -281,7 +336,10 @@ struct mdstat_ent *mdstat_read(int hold, int start) } if (hold && mdstat_fd == -1) { mdstat_fd = dup(fileno(f)); - fcntl(mdstat_fd, F_SETFD, FD_CLOEXEC); + if (fcntl(mdstat_fd, F_SETFD, FD_CLOEXEC) < 0) { + fclose(f); + return NULL; + } } fclose(f); @@ -382,61 +440,70 @@ int mddev_busy(char *devnm) return me != NULL; } -struct mdstat_ent *mdstat_by_component(char *name) +/** + * mdstat_find_by_member_name() - Return first array or external container with member device. + * @mdstat: Preloaded mdstat to iterate over. + * @member_devnm: devnm of the device to find. + * + * External subarrays are skipped. 
+ */ +struct mdstat_ent *mdstat_find_by_member_name(struct mdstat_ent *mdstat, char *member_devnm) { - struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; - while (mdstat) { - struct dev_member *m; - struct mdstat_ent *ent; - if (mdstat->metadata_version && - strncmp(mdstat->metadata_version, "external:", 9) == 0 && - is_subarray(mdstat->metadata_version+9)) - /* don't return subarrays, only containers */ - ; - else for (m = mdstat->members; m; m = m->next) { - if (strcmp(m->name, name) == 0) { - free_mdstat(mdstat->next); - mdstat->next = NULL; - return mdstat; - } - } - ent = mdstat; - mdstat = mdstat->next; - ent->next = NULL; - free_mdstat(ent); + for (ent = mdstat; ent; ent = ent->next) { + struct dev_member *member; + + if (is_mdstat_ent_subarray(ent)) + continue; + + for (member = ent->members; member; member = member->next) + if (strcmp(member->name, member_devnm) == 0) + return ent; } + return NULL; } + +struct mdstat_ent *mdstat_by_component(char *name) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent = mdstat_find_by_member_name(mdstat, name); + + if (ent) + mdstat_ent_list_detach_element(&mdstat, ent); + + free_mdstat(mdstat); + + return ent; +} + struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container) { struct mdstat_ent *mdstat = mdstat_read(0, 0); struct mdstat_ent *ent = NULL; - while (mdstat) { + for (ent = mdstat; ent; ent = ent->next) { /* metadata version must match: * external:[/-]%s/%s * where first %s is 'container' and second %s is 'subdev' */ - if (ent) - free_mdstat(ent); - ent = mdstat; - mdstat = mdstat->next; - ent->next = NULL; - if (ent->metadata_version == NULL || - strncmp(ent->metadata_version, "external:", 9) != 0) + if (!is_mdstat_ent_external(ent)) continue; - if (!metadata_container_matches(ent->metadata_version+9, - container) || - !metadata_subdev_matches(ent->metadata_version+9, - subdev)) + if (!metadata_container_matches(ent->metadata_version + 9, container)) + 
continue; + if (!metadata_subdev_matches(ent->metadata_version + 9, subdev)) continue; - free_mdstat(mdstat); - return ent; + break; } - return NULL; + + if (ent) + mdstat_ent_list_detach_element(&mdstat, ent); + + free_mdstat(mdstat); + return ent; } @@ -35,11 +35,6 @@ enum bb_action { COMPARE_BB, }; -static int write_attr(char *attr, int fd) -{ - return write(fd, attr, strlen(attr)); -} - static void add_fd(fd_set *fds, int *maxfd, int fd) { struct stat st; @@ -155,6 +150,8 @@ int read_dev_state(int fd) rv |= DS_SPARE; if (sysfs_attr_match(cp, "blocked")) rv |= DS_BLOCKED; + if (sysfs_attr_match(cp, "external_bbl")) + rv |= DS_EXTERNAL_BB; cp = strchr(cp, ','); if (cp) cp++; @@ -173,7 +170,7 @@ int process_ubb(struct active_array *a, struct mdinfo *mdi, const unsigned long * via sysfs file */ if ((ss->record_bad_block(a, mdi->disk.raid_disk, sector, length)) && - (write(mdi->bb_fd, buf, buf_len) == buf_len)) + (sysfs_write_descriptor(mdi->bb_fd, buf, buf_len, NULL) == MDADM_STATUS_SUCCESS)) return 1; /* @@ -311,9 +308,6 @@ static int check_for_cleared_bb(struct active_array *a, struct mdinfo *mdi) struct md_bb *bb; int i; - if (!ss->get_bad_blocks) - return -1; - /* * Get a list of bad blocks for an array, then read list of * acknowledged bad blocks from kernel and compare it against metadata @@ -402,11 +396,12 @@ static void signal_manager(void) #define ARRAY_DIRTY 1 #define ARRAY_BUSY 2 -static int read_and_act(struct active_array *a, fd_set *fds) +static int read_and_act(struct active_array *a) { unsigned long long sync_completed; - int check_degraded = 0; - int check_reshape = 0; + bool disks_to_remove = false; + bool check_degraded = false; + bool check_reshape = false; int deactivate = 0; struct mdinfo *mdi; int ret = 0; @@ -430,23 +425,32 @@ static int read_and_act(struct active_array *a, fd_set *fds) for (mdi = a->info.devs; mdi ; mdi = mdi->next) { mdi->next_state = 0; mdi->curr_state = 0; - if (mdi->state_fd >= 0) { - read_resync_start(mdi->recovery_fd, 
- &mdi->recovery_start); - mdi->curr_state = read_dev_state(mdi->state_fd); - } - /* - * If array is blocked and metadata handler is able to handle - * BB, check if you can acknowledge them to md driver. If - * successful, clear faulty state and unblock the array. - */ - if ((mdi->curr_state & DS_BLOCKED) && - a->container->ss->record_bad_block && - (process_dev_ubb(a, mdi) > 0)) { + + if (mdi->man_disk_to_remove) + /* We are removing this device, skip it then */ + continue; + + read_resync_start(mdi->recovery_fd, &mdi->recovery_start); + mdi->curr_state = read_dev_state(mdi->state_fd); + + if (!(mdi->curr_state & DS_EXTERNAL_BB)) + /* + * It assumes that superswitch badblock functions are set if disk + * has external badblocks support configured. + */ + continue; + + if ((mdi->curr_state & DS_BLOCKED) && process_dev_ubb(a, mdi) > 0) + /* + * Blocked has two meanings: we need to acknowledge failure or badblocks + * (if supported). Here, badblocks are handled. + * + * If successful, unblock the array. This is not perfect but + * process_dev_ubb() may disable badblock support in case of failure. + */ mdi->next_state |= DS_UNBLOCK; - } - if (FD_ISSET(mdi->bb_fd, fds)) - check_for_cleared_bb(a, mdi); + + check_for_cleared_bb(a, mdi); } gettimeofday(&tv, NULL); @@ -621,24 +625,12 @@ static int read_and_act(struct active_array *a, fd_set *fds) write_attr("-blocked", mdi->state_fd); } - if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) { - int remove_result; - - /* The kernel may not be able to immediately remove the - * disk. In that case we wait a little while and - * try again. 
- */ - remove_result = write_attr("remove", mdi->state_fd); - if (remove_result > 0) { - dprintf_cont(" %d:removed", mdi->disk.raid_disk); - close(mdi->state_fd); - close(mdi->recovery_fd); - close(mdi->bb_fd); - close(mdi->ubb_fd); - mdi->state_fd = -1; - } else - ret |= ARRAY_BUSY; + if ((mdi->next_state & DS_REMOVE) && !mdi->man_disk_to_remove) { + dprintf_cont(" %d:disk_to_remove", mdi->disk.raid_disk); + mdi->man_disk_to_remove = true; + disks_to_remove = true; } + if (mdi->next_state & DS_INSYNC) { write_attr("+in_sync", mdi->state_fd); dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk); @@ -651,17 +643,14 @@ static int read_and_act(struct active_array *a, fd_set *fds) a->prev_action = a->curr_action; - for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + for (mdi = a->info.devs; mdi ; mdi = mdi->next) mdi->prev_state = mdi->curr_state; - mdi->next_state = 0; - } - if (check_degraded || check_reshape) { - /* manager will do the actual check */ - if (check_degraded) - a->check_degraded = 1; - if (check_reshape) - a->check_reshape = 1; + if (check_degraded || check_reshape || disks_to_remove) { + + a->check_member_remove |= disks_to_remove; + a->check_degraded |= check_degraded; + a->check_reshape |= check_reshape; signal_manager(); } @@ -734,13 +723,11 @@ int monitor_loop_cnt; static int wait_and_act(struct supertype *container, int nowait) { - fd_set rfds; - int maxfd = 0; - struct active_array **aap = &container->arrays; - struct active_array *a, **ap; - int rv; - struct mdinfo *mdi; + struct active_array *a, **ap, **aap = &container->arrays; static unsigned int dirty_arrays = ~0; /* start at some non-zero value */ + struct mdinfo *mdi; + int rv, maxfd = 0; + fd_set rfds; FD_ZERO(&rfds); @@ -764,7 +751,18 @@ static int wait_and_act(struct supertype *container, int nowait) add_fd(&rfds, &maxfd, a->info.state_fd); add_fd(&rfds, &maxfd, a->action_fd); add_fd(&rfds, &maxfd, a->sync_completed_fd); + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + if 
(mdi->man_disk_to_remove) { + mdi->mon_descriptors_not_used = true; + + /* Managemon could be blocked on suspend in kernel. + * Monitor must respond if any badblock is recorded in this time. + */ + container->retry_soon = 1; + continue; + } + add_fd(&rfds, &maxfd, mdi->state_fd); add_fd(&rfds, &maxfd, mdi->bb_fd); add_fd(&rfds, &maxfd, mdi->ubb_fd); @@ -863,7 +861,8 @@ static int wait_and_act(struct supertype *container, int nowait) signal_manager(); } if (a->container && !a->to_remove) { - int ret = read_and_act(a, &rfds); + int ret = read_and_act(a); + rv |= 1; dirty_arrays += !!(ret & ARRAY_DIRTY); /* when terminating stop manipulating the array after it @@ -30,6 +30,7 @@ #include <sys/un.h> #include "mdadm.h" #include "mdmon.h" +#include "xmalloc.h" static const __u32 start_magic = 0x5a5aa5a5; static const __u32 end_magic = 0xa5a55a5a; @@ -176,8 +177,15 @@ int connect_monitor(char *devname) } fl = fcntl(sfd, F_GETFL, 0); + if (fl < 0) { + close(sfd); + return -1; + } fl |= O_NONBLOCK; - fcntl(sfd, F_SETFL, fl); + if (fcntl(sfd, F_SETFL, fl) < 0) { + close(sfd); + return -1; + } return sfd; } diff --git a/platform-intel.c b/platform-intel.c index d6a5353..95bc492 100644 --- a/platform-intel.c +++ b/platform-intel.c @@ -19,6 +19,8 @@ #include "mdadm.h" #include "platform-intel.h" #include "probe_roms.h" +#include "xmalloc.h" + #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -105,12 +107,75 @@ static void free_sys_dev(struct sys_dev **list) } } +/** + * vmd_find_pci_bus() - look for PCI bus created by VMD. + * @vmd_path: path to vmd driver. + * @buf: return buffer, must be PATH_MAX. + * + * Each VMD device represents one domain and each VMD device adds separate PCI bus. + * IMSM must know VMD domains, therefore it needs to determine and follow buses. 
+ * + */ +mdadm_status_t vmd_find_pci_bus(char *vmd_path, char *buf) +{ + char tmp[PATH_MAX]; + struct dirent *ent; + DIR *vmd_dir; + char *rp_ret; + + snprintf(tmp, PATH_MAX, "%s/domain/device", vmd_path); + + rp_ret = realpath(tmp, buf); + + if (rp_ret) + return MDADM_STATUS_SUCCESS; + + if (errno != ENOENT) + return MDADM_STATUS_ERROR; + + /* + * If it is done early, there is a chance that kernel is still enumerating VMD device but + * kernel did enough to start enumerating child devices, {vmd_path}/domain/device link may + * not exist yet. We have to look into @vmd_path directory and find it ourselves. + */ + + vmd_dir = opendir(vmd_path); + + if (!vmd_dir) + return MDADM_STATUS_ERROR; + + for (ent = readdir(vmd_dir); ent; ent = readdir(vmd_dir)) { + static const char pci[] = "pci"; + + /** + * Pci bus must have form pciXXXXX:XX, where X is a digit i.e pci10000:00. + * We do not check digits here, it is sysfs so it should be safe to check + * length and ':' position only. + */ + if (strncmp(ent->d_name, pci, strlen(pci)) != 0) + continue; + + if (ent->d_name[8] != ':' || ent->d_name[11] != 0) + continue; + break; + } + + if (!ent) { + closedir(vmd_dir); + return MDADM_STATUS_ERROR; + } + + snprintf(buf, PATH_MAX, "%s/%s", vmd_path, ent->d_name); + closedir(vmd_dir); + return MDADM_STATUS_SUCCESS; +} + struct sys_dev *find_driver_devices(const char *bus, const char *driver) { /* search sysfs for devices driven by 'driver' */ char path[PATH_MAX]; char link[PATH_MAX]; - char *c, *p; + char *c; DIR *driver_dir; struct dirent *de; struct sys_dev *head = NULL; @@ -142,8 +207,9 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver) return NULL; } for (de = readdir(driver_dir); de; de = readdir(driver_dir)) { - int n; int skip = 0; + char *p; + int n; /* is 'de' a device? 
check that the 'subsystem' link exists and * that its target matches 'bus' @@ -195,18 +261,20 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver) if (devpath_to_ll(path, "class", &class) != 0) continue; - /* - * Each VMD device (domain) adds separate PCI bus, it is better - * to store path as a path to that bus (easier further - * determination which NVMe dev is connected to this particular - * VMD domain). - */ if (type == SYS_DEV_VMD) { - sprintf(path, "/sys/bus/%s/drivers/%s/%s/domain/device", - bus, driver, de->d_name); + char vmd_path[PATH_MAX]; + + sprintf(vmd_path, "/sys/bus/%s/drivers/%s/%s", bus, driver, de->d_name); + + if (vmd_find_pci_bus(vmd_path, path)) { + pr_err("Cannot determine VMD bus for %s\n", vmd_path); + continue; + } } + p = realpath(path, NULL); - if (p == NULL) { + + if (!p) { pr_err("Unable to get real path for '%s'\n", path); continue; } @@ -577,6 +645,9 @@ static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba) #define SYS_EFI_VAR_PATH "/sys/firmware/efi/vars" #define SYS_EFIVARS_PATH "/sys/firmware/efi/efivars" +#define ACPI_TABLES_PATH "/sys/firmware/acpi/tables/" +#define ACPI_UEFI_TABLE_BASE_NAME "UEFI" +#define ACPI_UEFI_DATA_OFFSET 52 #define SCU_PROP "RstScuV" #define AHCI_PROP "RstSataV" #define AHCI_SSATA_PROP "RstsSatV" @@ -584,10 +655,73 @@ static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba) #define VROC_VMD_PROP "RstUefiV" #define RST_VMD_PROP "RstVmdV" -#define VENDOR_GUID \ +#define PCI_CLASS_RAID_CNTRL 0x010400 + +/* GUID length in Bytes */ +#define GUID_LENGTH 16 + +/* GUID entry in 'UEFI' for Sata controller. */ +#define RST_SATA_V_GUID \ + EFI_GUID(0xe4dd92e0, 0xac7d, 0x11df, 0x94, 0xe2, 0x08, 0x00, 0x20, 0x0c, 0x9a, 0x66) + +/* GUID entry in 'UEFI' for sSata controller. */ +#define RST_SSATA_V_GUID \ + EFI_GUID(0xb002be42, 0x901d, 0x4018, 0xb4, 0x1e, 0xd7, 0x04, 0xab, 0x3a, 0x0f, 0x15) + +/* GUID entry in 'UEFI' for tSata controller. 
*/ +#define RST_TSATA_V_GUID \ + EFI_GUID(0x101ce8f1, 0xb873, 0x4362, 0xa9, 0x76, 0xb5, 0x54, 0x31, 0x74, 0x52, 0x7e) + +/* GUID entry in 'UEFI' for Intel(R) VROC VMD. */ +#define RST_UEFI_V_GUID \ + EFI_GUID(0x4bf2da96, 0xde6e, 0x4d8a, 0xa8, 0x8b, 0xb3, 0xd, 0x33, 0xf6, 0xf, 0x3e) + +/** + * GUID entry in 'UEFI' for Intel(R) RST VMD. + * Currently is the same like in 'UEFI' for Sata controller. + */ +#define RST_VMD_V_GUID RST_SATA_V_GUID + +/* GUID of intel RST vendor EFI var. */ +#define INTEL_RST_VENDOR_GUID \ EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6) -#define PCI_CLASS_RAID_CNTRL 0x010400 +/* + * Unified Extensible Firmware Interface (UEFI) Specification Release 2.10 + * UEFI ACPI DATA TABLE, Table O.1 + */ +typedef struct uefi_acpi_table { + char signature[4]; + __u32 length; + __u8 revision; + __u8 checksum; + char oemid[6]; + /* controller name */ + char oem_table_id[8]; + __u32 oem_revision; + __u32 creator_id; + __u32 creator_revision; + /* controller GUID */ + struct efi_guid identifier; + /* OROM data offeset */ + __u16 dataOffset; +} uefi_acpi_table_t; + +typedef struct uefi_acpi_table_with_orom { + struct uefi_acpi_table table; + struct imsm_orom orom; +} uefi_acpi_table_with_orom_t; + +/* imsm_orom_id - Identifier used to match imsm efi var or acpi table + * @name: name of the UEFI property, it is part of efivar name or ACPI table oem_table_id + * @guid: acpi table guid identifier + * + * vendor guid (second part of evifar name) is not added here because it is cost. + */ +typedef struct imsm_orom_id { + char *name; + struct efi_guid guid; +} imsm_orom_id_t; static int read_efi_var(void *buffer, ssize_t buf_size, const char *variable_name, struct efi_guid guid) @@ -669,14 +803,238 @@ static int read_efi_variable(void *buffer, ssize_t buf_size, return 0; } +/** + * is_efi_guid_equal() - check if EFI guids are equal. + * @guid: EFI guid. + * @guid1: EFI guid to compare. 
+ * + * Return: %true if guid are equal, %false otherwise. + */ +static inline bool is_efi_guid_equal(struct efi_guid guid, struct efi_guid guid1) +{ + if (memcmp(guid.b, guid1.b, GUID_LENGTH) == 0) + return true; + return false; +} + +/** + * acpi_any_imsm_orom_id_matching() - match ACPI table with any of given imsm_orom_id. + * @imsm_orom_ids: array of IMSM OROM Identifiers. + * @imsm_orom_ids_number: number of IMSM OROM Identifiers. + * @table: struct with read ACPI UEFI table. + * + * Check if read UEFI table contains requested OROM id. + * EFI GUID and controller name are compared with expected. + * + * Return: %true if length is proper table, %false otherwise. + */ +bool acpi_any_imsm_orom_id_matching(imsm_orom_id_t *imsm_orom_ids, int imsm_orom_ids_number, + struct uefi_acpi_table table) +{ + int index; + + for (index = 0; index < imsm_orom_ids_number; index++) + if (strncmp(table.oem_table_id, imsm_orom_ids[index].name, + strlen(imsm_orom_ids[index].name)) == 0 && + is_efi_guid_equal(table.identifier, + imsm_orom_ids[index].guid) == true) + return true; + return false; +} + +/** + * read_uefi_acpi_orom_data() - read OROM data from UEFI ACPI table. + * @fd: file descriptor. + * @uefi_table: struct to fill out. + * + * Read OROM from ACPI UEFI table under given file descriptor. + * Table must have the appropriate OROM data, which should be confirmed before call this function. + * In case of success, &orom in structure in &uefi_table will be filled.. + * + * Return: %MDADM_STATUS_SUCCESS on success, %MDADM_STATUS_ERROR otherwise. 
+ */ +mdadm_status_t +read_uefi_acpi_orom_data(int fd, uefi_acpi_table_with_orom_t *uefi_table) +{ + assert(is_fd_valid(fd)); + + if (lseek(fd, uefi_table->table.dataOffset, 0) == -1L) + return MDADM_STATUS_ERROR; + + if (read(fd, &uefi_table->orom, sizeof(uefi_table->orom)) == -1) + return MDADM_STATUS_ERROR; + + return MDADM_STATUS_SUCCESS; +} + +/** + * verify_uefi_acpi_table_length() - verify if ACPI UEFI table have correct length with focus at + * OROM. + * @table: struct with UEFI table. + * + * Verify if ACPI UEFI table have correct length with focus at OROM. Make sure that the file is + * correct and contains the appropriate length data based on the length of the OROM. + * + * Return: %true if length is correct, %false otherwise. + */ +bool verify_uefi_acpi_table_length(struct uefi_acpi_table table) +{ + if (table.length < ACPI_UEFI_DATA_OFFSET) + return false; + + if (table.length - table.dataOffset != sizeof(struct imsm_orom)) + return false; + return true; +} + +/** + * find_orom_in_acpi_uefi_tables() - find OROM in UEFI ACPI tables based on requested OROM ids. + * @imsm_orom_ids: array of IMSM OROM Identifiers. + * @imsm_orom_ids_number: number of IMSM OROM Identifiers. + * @orom: OROM struct buffer to fill out. + * + * Find OROM in UEFI ACPI tables provided by Intel, based on requested controllers. + * The first one to be matched, will be used. + * If found, the buffer with the OROM structure will be filled. + * + * Return: %MDADM_STATUS_SUCCESS on success, %MDADM_STATUS_ERROR otherwise. 
+ */ +mdadm_status_t +find_orom_in_acpi_uefi_tables(imsm_orom_id_t *imsm_orom_ids, int imsm_orom_ids_number, + struct imsm_orom *orom) +{ + mdadm_status_t status = MDADM_STATUS_ERROR; + uefi_acpi_table_with_orom_t uefi_table; + char path[PATH_MAX]; + struct dirent *ent; + int fd = -1; + DIR *dir; + + dir = opendir(ACPI_TABLES_PATH); + if (!dir) + return MDADM_STATUS_ERROR; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + close_fd(&fd); + + /* Check if file is a UEFI table */ + if (strncmp(ent->d_name, ACPI_UEFI_TABLE_BASE_NAME, + strlen(ACPI_UEFI_TABLE_BASE_NAME)) != 0) + continue; + + snprintf(path, PATH_MAX, "%s/%s", ACPI_TABLES_PATH, ent->d_name); + + fd = open(path, O_RDONLY); + if (!is_fd_valid(fd)) { + pr_err("Fail to open ACPI UEFI table file. File: %s, Error: %s\n", + ent->d_name, strerror(errno)); + continue; + } + + if (read(fd, &uefi_table.table, sizeof(struct uefi_acpi_table)) == -1) { + pr_err("Fail to read IMSM OROM from ACPI UEFI table file. File: %s\n", + ent->d_name); + continue; + } + + if (!acpi_any_imsm_orom_id_matching(imsm_orom_ids, imsm_orom_ids_number, + uefi_table.table)) + continue; + + if (!verify_uefi_acpi_table_length(uefi_table.table)) + continue; + + if (read_uefi_acpi_orom_data(fd, &uefi_table)) { + pr_err("Fail to read IMSM OROM from ACPI UEFI table file. File: %s\n", + ent->d_name); + continue; + } + + memcpy(orom, &uefi_table.orom, sizeof(uefi_table.orom)); + status = MDADM_STATUS_SUCCESS; + break; + } + + close_fd(&fd); + closedir(dir); + return status; +} + +/** + * find_orom_in_efi_variables() - find first IMSM OROM in EFI vars that matches any imsm_orom_id. + * @imsm_orom_ids: array of IMSM OROM Identifiers. + * @imsm_orom_ids_number: number of IMSM OROM Identifiers. + * @orom: OROM struct buffer to fill out. + * + * Find IMSM OROM that matches on of imsm_orom_id in EFI variables. The first match is used. + * If found, the buffer with the OROM structure is filled. 
+ * + * Return: %MDADM_STATUS_SUCCESS on success, %MDADM_STATUS_ERROR otherwise. + */ +mdadm_status_t +find_orom_in_efi_variables(imsm_orom_id_t *imsm_orom_ids, int imsm_orom_ids_number, + struct imsm_orom *orom) +{ + int index; + + for (index = 0; index < imsm_orom_ids_number; index++) + if (!read_efi_variable(orom, sizeof(struct imsm_orom), imsm_orom_ids[index].name, + INTEL_RST_VENDOR_GUID)) + return MDADM_STATUS_SUCCESS; + return MDADM_STATUS_ERROR; +} + +/** + * find_imsm_efi_orom() - find OROM for requested controller. + * @orom: buffer for OROM. + * @controller_type: requested controller type. + * + * Based on controller type, function first search in EFI vars then in ACPI UEFI tables. + * For each controller there is defined an array of OROM ids from which we can read OROM, + * the first one to be matched, will be used. + * In case of success, the structure &orom will be filed out. + * + * Return: %MDADM_STATUS_SUCCESS on success. + */ +static mdadm_status_t +find_imsm_efi_orom(struct imsm_orom *orom, enum sys_dev_type controller_type) +{ + static imsm_orom_id_t sata_imsm_orrom_ids[] = { + {AHCI_PROP, RST_SATA_V_GUID}, + {AHCI_SSATA_PROP, RST_SSATA_V_GUID}, + {AHCI_TSATA_PROP, RST_TSATA_V_GUID}, + }; + static imsm_orom_id_t vmd_imsm_orom_ids[] = { + {VROC_VMD_PROP, RST_UEFI_V_GUID}, + {RST_VMD_PROP, RST_VMD_V_GUID}, + }; + static imsm_orom_id_t *imsm_orom_ids; + int imsm_orom_ids_number; + + switch (controller_type) { + case SYS_DEV_SATA: + imsm_orom_ids = sata_imsm_orrom_ids; + imsm_orom_ids_number = ARRAY_SIZE(sata_imsm_orrom_ids); + break; + case SYS_DEV_VMD: + case SYS_DEV_SATA_VMD: + imsm_orom_ids = vmd_imsm_orom_ids; + imsm_orom_ids_number = ARRAY_SIZE(vmd_imsm_orom_ids); + break; + default: + return MDADM_STATUS_UNDEF; + } + + if (!find_orom_in_efi_variables(imsm_orom_ids, imsm_orom_ids_number, orom)) + return MDADM_STATUS_SUCCESS; + + return find_orom_in_acpi_uefi_tables(imsm_orom_ids, imsm_orom_ids_number, orom); +} + const struct imsm_orom 
*find_imsm_efi(struct sys_dev *hba) { struct imsm_orom orom; struct orom_entry *ret; - static const char * const sata_efivars[] = {AHCI_PROP, AHCI_SSATA_PROP, - AHCI_TSATA_PROP}; - static const char * const vmd_efivars[] = {VROC_VMD_PROP, RST_VMD_PROP}; - unsigned long i; if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI")) return imsm_platform_test(hba); @@ -687,36 +1045,20 @@ const struct imsm_orom *find_imsm_efi(struct sys_dev *hba) switch (hba->type) { case SYS_DEV_SAS: - if (!read_efi_variable(&orom, sizeof(orom), SCU_PROP, - VENDOR_GUID)) + if (!read_efi_variable(&orom, sizeof(orom), SCU_PROP, INTEL_RST_VENDOR_GUID)) break; - return NULL; case SYS_DEV_SATA: if (hba->class != PCI_CLASS_RAID_CNTRL) return NULL; - for (i = 0; i < ARRAY_SIZE(sata_efivars); i++) { - if (!read_efi_variable(&orom, sizeof(orom), - sata_efivars[i], VENDOR_GUID)) - break; - - } - if (i == ARRAY_SIZE(sata_efivars)) + if (find_imsm_efi_orom(&orom, hba->type)) return NULL; - break; case SYS_DEV_VMD: case SYS_DEV_SATA_VMD: - for (i = 0; i < ARRAY_SIZE(vmd_efivars); i++) { - if (!read_efi_variable(&orom, sizeof(orom), - vmd_efivars[i], VENDOR_GUID)) - break; - } - - if (i == ARRAY_SIZE(vmd_efivars)) + if (find_imsm_efi_orom(&orom, hba->type)) return NULL; - break; default: return NULL; @@ -749,7 +1091,8 @@ const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba) .vpa = IMSM_OROM_VOLUMES_PER_ARRAY, .vphba = IMSM_OROM_TOTAL_DISKS_NVME / 2 * IMSM_OROM_VOLUMES_PER_ARRAY, .attr = IMSM_OROM_ATTR_2TB | IMSM_OROM_ATTR_2TB_DISK, - .driver_features = IMSM_OROM_CAPABILITIES_EnterpriseSystem + .driver_features = IMSM_OROM_CAPABILITIES_EnterpriseSystem | + IMSM_OROM_CAPABILITIES_TPV }; nvme_orom = add_orom(&nvme_orom_compat); } @@ -991,31 +1334,28 @@ char *diskfd_to_devpath(int fd, int dev_level, char *buf) return devt_to_devpath(st.st_rdev, dev_level, buf); } - -int path_attached_to_hba(const char *disk_path, const char *hba_path) +/** + * is_path_attached_to_hba() - Check if disk 
is attached to hba + * + * @disk_path: Path to disk. + * @hba_path: Path to hba. + * + * Returns: true if disk is attached to hba, false otherwise. + */ +bool is_path_attached_to_hba(const char *disk_path, const char *hba_path) { - int rc; - - if (check_env("IMSM_TEST_AHCI_DEV") || - check_env("IMSM_TEST_SCU_DEV")) { - return 1; - } - if (!disk_path || !hba_path) - return 0; - dprintf("hba: %s - disk: %s\n", hba_path, disk_path); + return false; if (strncmp(disk_path, hba_path, strlen(hba_path)) == 0) - rc = 1; - else - rc = 0; + return true; - return rc; + return false; } int devt_attached_to_hba(dev_t dev, const char *hba_path) { char *disk_path = devt_to_devpath(dev, 1, NULL); - int rc = path_attached_to_hba(disk_path, hba_path); + int rc = is_path_attached_to_hba(disk_path, hba_path); if (disk_path) free(disk_path); @@ -1026,7 +1366,7 @@ int devt_attached_to_hba(dev_t dev, const char *hba_path) int disk_attached_to_hba(int fd, const char *hba_path) { char *disk_path = diskfd_to_devpath(fd, 1, NULL); - int rc = path_attached_to_hba(disk_path, hba_path); + int rc = is_path_attached_to_hba(disk_path, hba_path); if (disk_path) free(disk_path); diff --git a/platform-intel.h b/platform-intel.h index dcc5aaa..63d4168 100644 --- a/platform-intel.h +++ b/platform-intel.h @@ -106,7 +106,12 @@ struct imsm_orom { #define IMSM_OROM_CAPABILITIES_ReadPatrol (1 << 6) #define IMSM_OROM_CAPABILITIES_XorHw (1 << 7) #define IMSM_OROM_CAPABILITIES_SKUMode ((1 << 8)|(1 << 9)) + #define IMSM_OROM_CAPABILITIES_SKUMode_LOW ((1 << 8) | (1 << 9)) + #define IMSM_OROM_CAPABILITIES_SKUMode_LOW_SHIFT 8 #define IMSM_OROM_CAPABILITIES_TPV (1 << 10) + #define IMSM_OROM_CAPABILITIES_SKUMode_HIGH ((1 << 11) | (1 << 12)) + #define IMSM_OROM_CAPABILITIES_SKUMode_HIGH_SHIFT 9 + #define IMSM_OROM_CAPABILITIES_SKUMode_NON_PRODUCTION (1 << 13) } __attribute__((packed)); /* IMSM metadata requirements for each level */ @@ -257,7 +262,7 @@ const struct imsm_orom *find_imsm_orom(void); int 
disk_attached_to_hba(int fd, const char *hba_path); int devt_attached_to_hba(dev_t dev, const char *hba_path); char *devt_to_devpath(dev_t dev, int dev_level, char *buf); -int path_attached_to_hba(const char *disk_path, const char *hba_path); +bool is_path_attached_to_hba(const char *disk_path, const char *hba_path); const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id); const struct imsm_orom *get_orom_by_device_id(__u16 device_id); struct sys_dev *device_by_id(__u16 device_id); @@ -23,6 +23,8 @@ */ #include "mdadm.h" +#include "xmalloc.h" + #include <dirent.h> #include <fnmatch.h> #include <ctype.h> @@ -969,19 +971,13 @@ int generate_entries(int fd) */ int Write_rules(char *rule_name) { - int fd; - char udev_rule_file[PATH_MAX]; + int fd = fileno(stdout); - if (rule_name) { - strncpy(udev_rule_file, rule_name, sizeof(udev_rule_file) - 6); - udev_rule_file[sizeof(udev_rule_file) - 6] = '\0'; - strcat(udev_rule_file, ".temp"); - fd = creat(udev_rule_file, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - if (fd == -1) - return 1; - } else - fd = 1; + if (rule_name) + fd = creat(rule_name, 0644); + + if (!is_fd_valid(fd)) + return 1; /* write static invocation */ if (write(fd, udev_template_start, sizeof(udev_template_start) - 1) != @@ -993,15 +989,14 @@ int Write_rules(char *rule_name) goto abort; fsync(fd); - if (rule_name) { + if (rule_name) close(fd); - rename(udev_rule_file, rule_name); - } + return 0; abort: if (rule_name) { close(fd); - unlink(udev_rule_file); + unlink(rule_name); } return 1; } @@ -23,6 +23,8 @@ */ #include "mdadm.h" +#include "xmalloc.h" + #include <stdint.h> /* To restripe, we read from old geometry to a buffer, and diff --git a/sg_io.c b/sg_io.c deleted file mode 100644 index 7889a95..0000000 --- a/sg_io.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2007-2008 Intel Corporation - * - * Retrieve drive serial numbers for scsi disks - * - * This program is free software; you can redistribute it and/or modify it - * under the 
terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ -#include <string.h> -#include <scsi/scsi.h> -#include <scsi/sg.h> -#include <sys/ioctl.h> - -int scsi_get_serial(int fd, void *buf, size_t buf_len) -{ - unsigned char rsp_buf[255]; - unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, sizeof(rsp_buf), 0}; - unsigned char sense[32]; - struct sg_io_hdr io_hdr; - int rv; - unsigned int rsp_len; - - memset(&io_hdr, 0, sizeof(io_hdr)); - io_hdr.interface_id = 'S'; - io_hdr.cmdp = inq_cmd; - io_hdr.cmd_len = sizeof(inq_cmd); - io_hdr.dxferp = rsp_buf; - io_hdr.dxfer_len = sizeof(rsp_buf); - io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; - io_hdr.sbp = sense; - io_hdr.mx_sb_len = sizeof(sense); - io_hdr.timeout = 5000; - - rv = ioctl(fd, SG_IO, &io_hdr); - - if (rv) - return rv; - - if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) - return -1; - - rsp_len = rsp_buf[3]; - - if (!rsp_len || buf_len < rsp_len) - return -1; - - memcpy(buf, &rsp_buf[4], rsp_len); - - return 0; -} diff --git a/super-ddf.c b/super-ddf.c index d870102..6cd099a 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -29,6 +29,8 @@ #include "mdadm.h" #include "mdmon.h" #include "sha1.h" +#include "xmalloc.h" + #include <values.h> #include <stddef.h> diff --git a/super-gpt.c b/super-gpt.c index a1e9aa9..50c267f 100644 --- a/super-gpt.c +++ b/super-gpt.c @@ -40,6 +40,7 @@ #include "mdadm.h" #include "part.h" +#include "xmalloc.h" static void free_gpt(struct supertype *st) { @@ -105,7 
+106,8 @@ static int load_gpt(struct supertype *st, int fd, char *devname) return 1; } /* Set offset to second block (GPT header) */ - lseek(fd, sector_size, SEEK_SET); + if (lseek(fd, sector_size, SEEK_SET) == -1L) + goto no_read; /* Seem to have GPT, load the header */ gpt_head = (struct GPT*)(super+1); if (read(fd, gpt_head, sizeof(*gpt_head)) != sizeof(*gpt_head)) @@ -118,7 +120,8 @@ static int load_gpt(struct supertype *st, int fd, char *devname) to_read = __le32_to_cpu(gpt_head->part_cnt) * sizeof(struct GPT_part_entry); to_read = ((to_read+511)/512) * 512; /* Set offset to third block (GPT entries) */ - lseek(fd, sector_size*2, SEEK_SET); + if (lseek(fd, sector_size * 2, SEEK_SET) == -1L) + goto no_read; if (read(fd, gpt_head+1, to_read) != to_read) goto no_read; diff --git a/super-intel.c b/super-intel.c index 713bfcc..55c71e2 100644 --- a/super-intel.c +++ b/super-intel.c @@ -21,13 +21,18 @@ #include "mdadm.h" #include "mdmon.h" #include "dlink.h" +#include "drive_encryption.h" #include "sha1.h" #include "platform-intel.h" -#include <values.h> -#include <scsi/sg.h> +#include "xmalloc.h" + #include <ctype.h> #include <dirent.h> -#include "drive_encryption.h" +#include <scsi/scsi.h> +#include <scsi/sg.h> +#include <string.h> +#include <sys/ioctl.h> +#include <values.h> /* MPB == Metadata Parameter Block */ #define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. 
" @@ -194,6 +199,8 @@ ASSERT_SIZE(imsm_map, 52) struct imsm_vol { __u32 curr_migr_unit_lo; __u32 checkpoint_id; /* id to access curr_migr_unit */ +#define MIGR_STATE_NORMAL 0 +#define MIGR_STATE_MIGRATING 1 __u8 migr_state; /* Normal or Migrating */ #define MIGR_INIT 0 #define MIGR_REBUILD 1 @@ -638,6 +645,14 @@ struct imsm_update_rwh_policy { int dev_idx; }; +enum imsm_sku { + SKU_NO_KEY = 0, + SKU_STANDARD_KEY = 1, + SKU_PREMIUM_KEY = 2, + SKU_INTEL_SSD_ONLY_KEY = 3, + SKU_RAID1_ONLY_KEY = 4 +}; + static const char *_sys_dev_type[] = { [SYS_DEV_UNKNOWN] = "Unknown", [SYS_DEV_SAS] = "SAS", @@ -647,6 +662,31 @@ static const char *_sys_dev_type[] = { [SYS_DEV_SATA_VMD] = "SATA VMD" }; +struct imsm_chunk_ops { + uint chunk; + char *chunk_str; +}; + +static const struct imsm_chunk_ops imsm_chunk_ops[] = { + {IMSM_OROM_SSS_2kB, "2k"}, + {IMSM_OROM_SSS_4kB, "4k"}, + {IMSM_OROM_SSS_8kB, "8k"}, + {IMSM_OROM_SSS_16kB, "16k"}, + {IMSM_OROM_SSS_32kB, "32k"}, + {IMSM_OROM_SSS_64kB, "64k"}, + {IMSM_OROM_SSS_128kB, "128k"}, + {IMSM_OROM_SSS_256kB, "256k"}, + {IMSM_OROM_SSS_512kB, "512k"}, + {IMSM_OROM_SSS_1MB, "1M"}, + {IMSM_OROM_SSS_2MB, "2M"}, + {IMSM_OROM_SSS_4MB, "4M"}, + {IMSM_OROM_SSS_8MB, "8M"}, + {IMSM_OROM_SSS_16MB, "16M"}, + {IMSM_OROM_SSS_32MB, "32M"}, + {IMSM_OROM_SSS_64MB, "64M"}, + {0, NULL} +}; + static int no_platform = -1; static int check_no_platform(void) @@ -773,7 +813,7 @@ static struct sys_dev* find_disk_attached_hba(int fd, const char *devname) return 0; for (elem = list; elem; elem = elem->next) - if (path_attached_to_hba(disk_path, elem->path)) + if (is_path_attached_to_hba(disk_path, elem->path)) break; if (disk_path != devname) @@ -2385,7 +2425,7 @@ static int ahci_enumerate_ports(struct sys_dev *hba, unsigned long port_count, i path = devt_to_devpath(makedev(major, minor), 1, NULL); if (!path) continue; - if (!path_attached_to_hba(path, hba->path)) { + if (!is_path_attached_to_hba(path, hba->path)) { free(path); path = NULL; continue; @@ -2536,7 
+2576,7 @@ static int print_nvme_info(struct sys_dev *hba) !diskfd_to_devpath(fd, 1, cntrl_path)) goto skip; - if (!path_attached_to_hba(cntrl_path, hba->path)) + if (!is_path_attached_to_hba(cntrl_path, hba->path)) goto skip; if (!imsm_is_nvme_namespace_supported(fd, 0)) @@ -2626,9 +2666,55 @@ static void print_imsm_level_capability(const struct imsm_orom *orom) printf("%s ", imsm_level_ops[idx].name); } -static void print_imsm_capability(const struct imsm_orom *orom) +static void print_imsm_sku_capability(const struct imsm_orom *orom) +{ + int key_val; + + key_val = (orom->driver_features & IMSM_OROM_CAPABILITIES_SKUMode_LOW) >> + IMSM_OROM_CAPABILITIES_SKUMode_LOW_SHIFT; + key_val |= (orom->driver_features & IMSM_OROM_CAPABILITIES_SKUMode_HIGH) >> + IMSM_OROM_CAPABILITIES_SKUMode_HIGH_SHIFT; + + switch (key_val) { + case SKU_NO_KEY: + printf("Pass-through"); + break; + case SKU_STANDARD_KEY: + printf("Standard"); + break; + case SKU_PREMIUM_KEY: + printf("Premium"); + break; + case SKU_INTEL_SSD_ONLY_KEY: + printf("Intel-SSD-only"); + break; + case SKU_RAID1_ONLY_KEY: + printf("RAID1 Only"); + break; + default: + printf("Unknown"); + } + + if (orom->driver_features & IMSM_OROM_CAPABILITIES_SKUMode_NON_PRODUCTION) + printf(" - for evaluation only"); +} + +static void print_imsm_chunk_size_capability(const struct imsm_orom *orom) { + int idx; + + for (idx = 0; imsm_chunk_ops[idx].chunk_str; idx++) + if (imsm_chunk_ops[idx].chunk & orom->sss) + printf("%s ", imsm_chunk_ops[idx].chunk_str); +} + + +static void print_imsm_capability(const struct orom_entry *entry) +{ + const struct imsm_orom *orom = &entry->orom; + printf(" Platform : Intel(R) "); + if (orom->capabilities == 0 && orom->driver_features == 0) printf("Matrix Storage Manager\n"); else if (imsm_orom_is_enterprise(orom) && orom->major_ver >= 6) @@ -2636,44 +2722,43 @@ static void print_imsm_capability(const struct imsm_orom *orom) else printf("Rapid Storage Technology%s\n", imsm_orom_is_enterprise(orom) ? 
" enterprise" : ""); + if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build) { if (imsm_orom_is_vmd_without_efi(orom)) - printf(" Version : %d.%d\n", orom->major_ver, - orom->minor_ver); + printf(" Version : %d.%d\n", orom->major_ver, orom->minor_ver); else - printf(" Version : %d.%d.%d.%d\n", orom->major_ver, - orom->minor_ver, orom->hotfix_ver, orom->build); + printf(" Version : %d.%d.%d.%d\n", orom->major_ver, orom->minor_ver, + orom->hotfix_ver, orom->build); + } + + if (entry->type == SYS_DEV_VMD) { + printf(" License : "); + print_imsm_sku_capability(orom); + printf("\n"); } printf(" RAID Levels : "); print_imsm_level_capability(orom); printf("\n"); - printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - imsm_orom_has_chunk(orom, 2) ? " 2k" : "", - imsm_orom_has_chunk(orom, 4) ? " 4k" : "", - imsm_orom_has_chunk(orom, 8) ? " 8k" : "", - imsm_orom_has_chunk(orom, 16) ? " 16k" : "", - imsm_orom_has_chunk(orom, 32) ? " 32k" : "", - imsm_orom_has_chunk(orom, 64) ? " 64k" : "", - imsm_orom_has_chunk(orom, 128) ? " 128k" : "", - imsm_orom_has_chunk(orom, 256) ? " 256k" : "", - imsm_orom_has_chunk(orom, 512) ? " 512k" : "", - imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "", - imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "", - imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "", - imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "", - imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "", - imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "", - imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : ""); - printf(" 2TB volumes :%s supported\n", - (orom->attr & IMSM_OROM_ATTR_2TB)?"":" not"); + printf(" Chunk Sizes : "); + print_imsm_chunk_size_capability(orom); + printf("\n"); + + printf(" 2TB volumes :%s supported\n", (orom->attr & IMSM_OROM_ATTR_2TB) ? "" : " not"); + printf(" 2TB disks :%s supported\n", - (orom->attr & IMSM_OROM_ATTR_2TB_DISK)?"":" not"); + (orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? 
"" : " not"); + printf(" Max Disks : %d\n", orom->tds); - printf(" Max Volumes : %d per array, %d per %s\n", - orom->vpa, orom->vphba, + + printf(" Max Volumes : %d per array, %d per %s\n", orom->vpa, orom->vphba, imsm_orom_is_nvme(orom) ? "platform" : "controller"); + + if (entry->type == SYS_DEV_VMD || entry->type == SYS_DEV_NVME) + /* This is only meaningful for controllers with nvme support */ + printf(" 3rd party NVMe :%s supported\n", + imsm_orom_has_tpv_support(&entry->orom) ? "" : " not"); return; } @@ -2688,23 +2773,10 @@ static void print_imsm_capability_export(const struct imsm_orom *orom) print_imsm_level_capability(orom); printf("\n"); - printf("IMSM_SUPPORTED_CHUNK_SIZES=%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - imsm_orom_has_chunk(orom, 2) ? "2k " : "", - imsm_orom_has_chunk(orom, 4) ? "4k " : "", - imsm_orom_has_chunk(orom, 8) ? "8k " : "", - imsm_orom_has_chunk(orom, 16) ? "16k " : "", - imsm_orom_has_chunk(orom, 32) ? "32k " : "", - imsm_orom_has_chunk(orom, 64) ? "64k " : "", - imsm_orom_has_chunk(orom, 128) ? "128k " : "", - imsm_orom_has_chunk(orom, 256) ? "256k " : "", - imsm_orom_has_chunk(orom, 512) ? "512k " : "", - imsm_orom_has_chunk(orom, 1024*1) ? "1M " : "", - imsm_orom_has_chunk(orom, 1024*2) ? "2M " : "", - imsm_orom_has_chunk(orom, 1024*4) ? "4M " : "", - imsm_orom_has_chunk(orom, 1024*8) ? "8M " : "", - imsm_orom_has_chunk(orom, 1024*16) ? "16M " : "", - imsm_orom_has_chunk(orom, 1024*32) ? "32M " : "", - imsm_orom_has_chunk(orom, 1024*64) ? "64M " : ""); + printf("IMSM_SUPPORTED_CHUNK_SIZES="); + print_imsm_chunk_size_capability(orom); + printf("\n"); + printf("IMSM_2TB_VOLUMES=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB) ? "yes" : "no"); printf("IMSM_2TB_DISKS=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? "yes" : "no"); printf("IMSM_MAX_DISKS=%d\n",orom->tds); @@ -2725,26 +2797,25 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle * platform capabilities. 
If raid support is disabled in the BIOS the * option-rom capability structure will not be available. */ + const struct orom_entry *entry; struct sys_dev *list, *hba; - int host_base = 0; + struct devid_list *devid; int port_count = 0; - int result=1; + int host_base = 0; + int result = 1; if (enumerate_only) { if (check_no_platform()) return 0; + list = find_intel_devices(); if (!list) return 2; - for (hba = list; hba; hba = hba->next) { - if (find_imsm_capability(hba)) { - result = 0; - break; - } - else - result = 2; - } - return result; + + for (hba = list; hba; hba = hba->next) + if (find_imsm_capability(hba)) + return 0; + return 2; } list = find_intel_devices(); @@ -2760,6 +2831,7 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle continue; if (!find_imsm_capability(hba)) { char buf[PATH_MAX]; + pr_err("imsm capabilities not found for controller: %s (type %s)\n", hba->type == SYS_DEV_VMD || hba->type == SYS_DEV_SATA_VMD ? vmd_domain_to_controller(hba, buf) : @@ -2775,40 +2847,27 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle return result; } - const struct orom_entry *entry; - for (entry = orom_entries; entry; entry = entry->next) { - if (entry->type == SYS_DEV_VMD) { - print_imsm_capability(&entry->orom); - printf(" 3rd party NVMe :%s supported\n", - imsm_orom_has_tpv_support(&entry->orom)?"":" not"); + print_imsm_capability(entry); + + if (entry->type == SYS_DEV_VMD || entry->type == SYS_DEV_NVME) { for (hba = list; hba; hba = hba->next) { - if (hba->type == SYS_DEV_VMD) { - char buf[PATH_MAX]; + char buf[PATH_MAX]; + + if (hba->type != entry->type) + continue; + + if (hba->type == SYS_DEV_VMD) printf(" I/O Controller : %s (%s)\n", - vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type)); - if (print_nvme_info(hba)) { - if (verbose > 0) - pr_err("failed to get devices attached to VMD domain.\n"); - result |= 2; - } - } - } - printf("\n"); - continue; - } + 
vmd_domain_to_controller(hba, buf), + get_sys_dev_type(hba->type)); - print_imsm_capability(&entry->orom); - if (entry->type == SYS_DEV_NVME) { - for (hba = list; hba; hba = hba->next) { - if (hba->type == SYS_DEV_NVME) - print_nvme_info(hba); + print_nvme_info(hba); } printf("\n"); continue; } - struct devid_list *devid; for (devid = entry->devid_list; devid; devid = devid->next) { hba = device_by_id(devid->devid); if (!hba) @@ -4122,7 +4181,43 @@ static int nvme_get_serial(int fd, void *buf, size_t buf_len) return devpath_to_char(path, "serial", buf, buf_len, 0); } -extern int scsi_get_serial(int fd, void *buf, size_t buf_len); +mdadm_status_t scsi_get_serial(int fd, void *buf, size_t buf_len) +{ + struct sg_io_hdr io_hdr = {0}; + unsigned char rsp_buf[255]; + unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, sizeof(rsp_buf), 0}; + unsigned char sense[32]; + unsigned int rsp_len; + int rv; + + io_hdr.interface_id = 'S'; + io_hdr.cmdp = inq_cmd; + io_hdr.cmd_len = sizeof(inq_cmd); + io_hdr.dxferp = rsp_buf; + io_hdr.dxfer_len = sizeof(rsp_buf); + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.sbp = sense; + io_hdr.mx_sb_len = sizeof(sense); + io_hdr.timeout = 5000; + + rv = ioctl(fd, SG_IO, &io_hdr); + + if (rv) + return MDADM_STATUS_ERROR; + + if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) + return MDADM_STATUS_ERROR; + + rsp_len = rsp_buf[3]; + + if (!rsp_len || buf_len < rsp_len) + return MDADM_STATUS_ERROR; + + memcpy(buf, &rsp_buf[4], rsp_len); + + return MDADM_STATUS_SUCCESS; +} + static int imsm_read_serial(int fd, char *devname, __u8 *serial, size_t serial_buf_len) @@ -4297,7 +4392,7 @@ static void migrate(struct imsm_dev *dev, struct intel_super *super, struct imsm_map *dest; struct imsm_map *src = get_imsm_map(dev, MAP_0); - dev->vol.migr_state = 1; + dev->vol.migr_state = MIGR_STATE_MIGRATING; set_migr_type(dev, migr_type); set_vol_curr_migr_unit(dev, 0); dest = get_imsm_map(dev, MAP_1); @@ -4324,8 +4419,14 @@ static void migrate(struct imsm_dev 
*dev, struct intel_super *super, static void end_migration(struct imsm_dev *dev, struct intel_super *super, __u8 map_state) { + /* To avoid compilation error, saying dev can't be NULL when + * migr_state is assigned. + */ + if (dev == NULL) + return; + struct imsm_map *map = get_imsm_map(dev, MAP_0); - struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state == 0 ? + struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state == MIGR_STATE_NORMAL ? MAP_0 : MAP_1); int i, j; @@ -4357,7 +4458,7 @@ static void end_migration(struct imsm_dev *dev, struct intel_super *super, map_state = imsm_check_degraded(super, dev, failed, MAP_0); } - dev->vol.migr_state = 0; + dev->vol.migr_state = MIGR_STATE_NORMAL; set_migr_type(dev, 0); set_vol_curr_migr_unit(dev, 0); map->map_state = map_state; @@ -4437,7 +4538,7 @@ int check_mpb_migr_compatibility(struct intel_super *super) for (i = 0; i < super->anchor->num_raid_devs; i++) { struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); - if (dev_iter->vol.migr_state == 1 && + if (dev_iter->vol.migr_state == MIGR_STATE_MIGRATING && dev_iter->vol.migr_type == MIGR_GEN_MIGR) { /* This device is migrating */ map0 = get_imsm_map(dev_iter, MAP_0); @@ -5642,7 +5743,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, set_imsm_dev_size(dev, array_blocks); dev->status = (DEV_READ_COALESCING | DEV_WRITE_COALESCING); vol = &dev->vol; - vol->migr_state = 0; + vol->migr_state = MIGR_STATE_NORMAL; set_migr_type(dev, MIGR_INIT); vol->dirty = !info->state; set_vol_curr_migr_unit(dev, 0); @@ -5962,12 +6063,12 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, unsigned long long data_offset) { struct intel_super *super = st->sb; - struct dl *dd; - unsigned long long size; unsigned int member_sector_size; + unsigned long long size; + struct stat stb; + struct dl *dd; __u32 id; int rv; - struct stat stb; /* If we are on an RAID enabled platform check that the disk is * attached to the 
raid controller. @@ -5977,114 +6078,85 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, rv = find_intel_hba_capability(fd, super, devname); /* no orom/efi or non-intel hba of the disk */ if (rv != 0) { - dprintf("capability: %p fd: %d ret: %d\n", - super->orom, fd, rv); - return 1; + dprintf("capability: %p fd: %d ret: %d\n", super->orom, fd, rv); + return MDADM_STATUS_ERROR; } if (super->current_vol >= 0) return add_to_super_imsm_volume(st, dk, fd, devname); if (fstat(fd, &stb) != 0) - return 1; + return MDADM_STATUS_ERROR; + dd = xcalloc(sizeof(*dd), 1); + + if (devname) + dd->devname = xstrdup(devname); + + if (sysfs_disk_to_scsi_id(fd, &id) == 0) + dd->disk.scsi_id = __cpu_to_le32(id); + dd->major = major(stb.st_rdev); dd->minor = minor(stb.st_rdev); - dd->devname = devname ? xstrdup(devname) : NULL; - dd->fd = fd; - dd->e = NULL; dd->action = DISK_ADD; + dd->fd = fd; + rv = imsm_read_serial(fd, devname, dd->serial, MAX_RAID_SERIAL_LEN); if (rv) { pr_err("failed to retrieve scsi serial, aborting\n"); - __free_imsm_disk(dd, 0); - abort(); + goto error; } if (super->hba && ((super->hba->type == SYS_DEV_NVME) || (super->hba->type == SYS_DEV_VMD))) { - int i; - char cntrl_path[PATH_MAX]; - char *cntrl_name; char pci_dev_path[PATH_MAX]; + char cntrl_path[PATH_MAX]; if (!diskfd_to_devpath(fd, 2, pci_dev_path) || !diskfd_to_devpath(fd, 1, cntrl_path)) { pr_err("failed to get dev paths, aborting\n"); - __free_imsm_disk(dd, 0); - return 1; + goto error; } - cntrl_name = basename(cntrl_path); if (is_multipath_nvme(fd)) pr_err("%s controller supports Multi-Path I/O, Intel (R) VROC does not support multipathing\n", - cntrl_name); - - if (devpath_to_vendor(pci_dev_path) == 0x8086) { - /* - * If Intel's NVMe drive has serial ended with - * "-A","-B","-1" or "-2" it means that this is "x8" - * device (double drive on single PCIe card). - * User should be warned about potential data loss. 
- */ - for (i = MAX_RAID_SERIAL_LEN-1; i > 0; i--) { - /* Skip empty character at the end */ - if (dd->serial[i] == 0) - continue; + basename(cntrl_path)); - if (((dd->serial[i] == 'A') || - (dd->serial[i] == 'B') || - (dd->serial[i] == '1') || - (dd->serial[i] == '2')) && - (dd->serial[i-1] == '-')) - pr_err("\tThe action you are about to take may put your data at risk.\n" - "\tPlease note that x8 devices may consist of two separate x4 devices " - "located on a single PCIe port.\n" - "\tRAID 0 is the only supported configuration for this type of x8 device.\n"); - break; - } - } else if (super->hba->type == SYS_DEV_VMD && super->orom && - !imsm_orom_has_tpv_support(super->orom)) { + if (super->orom && !imsm_orom_has_tpv_support(super->orom)) { pr_err("\tPlatform configuration does not support non-Intel NVMe drives.\n" "\tPlease refer to Intel(R) RSTe/VROC user guide.\n"); - __free_imsm_disk(dd, 0); - return 1; + goto error; } } - get_dev_size(fd, NULL, &size); - if (!get_dev_sector_size(fd, NULL, &member_sector_size)) { - __free_imsm_disk(dd, 0); - return 1; - } + if (!get_dev_size(fd, NULL, &size) || !get_dev_sector_size(fd, NULL, &member_sector_size)) + goto error; - if (super->sector_size == 0) { + if (super->sector_size == 0) /* this a first device, so sector_size is not set yet */ super->sector_size = member_sector_size; - } /* clear migr_rec when adding disk to container */ - memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE); - if (lseek64(fd, size - MIGR_REC_SECTOR_POSITION*member_sector_size, - SEEK_SET) >= 0) { - if ((unsigned int)write(fd, super->migr_rec_buf, - MIGR_REC_BUF_SECTORS*member_sector_size) != - MIGR_REC_BUF_SECTORS*member_sector_size) + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS * MAX_SECTOR_SIZE); + + if (lseek64(fd, (size - MIGR_REC_SECTOR_POSITION * member_sector_size), SEEK_SET) >= 0) { + unsigned int nbytes = MIGR_REC_BUF_SECTORS * member_sector_size; + + if ((unsigned int)write(fd, super->migr_rec_buf, 
nbytes) != nbytes) perror("Write migr_rec failed"); } size /= 512; serialcpy(dd->disk.serial, dd->serial); set_total_blocks(&dd->disk, size); + if (__le32_to_cpu(dd->disk.total_blocks_hi) > 0) { struct imsm_super *mpb = super->anchor; + mpb->attributes |= MPB_ATTRIB_2TB_DISK; } + mark_spare(dd); - if (sysfs_disk_to_scsi_id(fd, &id) == 0) - dd->disk.scsi_id = __cpu_to_le32(id); - else - dd->disk.scsi_id = __cpu_to_le32(0); if (st->update_tail) { dd->next = super->disk_mgmt_list; @@ -6099,7 +6171,11 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, write_super_imsm_spare(super, dd); } - return 0; + return MDADM_STATUS_SUCCESS; + +error: + __free_imsm_disk(dd, 0); + return MDADM_STATUS_ERROR; } static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk) @@ -6974,13 +7050,11 @@ active_arrays_by_format(char *name, char* hba, struct md_list **devlist, int found; for (memb = mdstat ; memb ; memb = memb->next) { - if (memb->metadata_version && - (strncmp(memb->metadata_version, "external:", 9) == 0) && - (strcmp(&memb->metadata_version[9], name) == 0) && - !is_subarray(memb->metadata_version+9) && - memb->members) { + if (is_mdstat_ent_external(memb) && !is_subarray(memb->metadata_version + 9) && + strcmp(&memb->metadata_version[9], name) == 0 && memb->members) { struct dev_member *dev = memb->members; int fd = -1; + while (dev && !is_fd_valid(fd)) { char *path = xmalloc(strlen(dev->name) + strlen("/dev/") + 1); num = snprintf(path, PATH_MAX, "%s%s", "/dev/", dev->name); @@ -6998,7 +7072,6 @@ active_arrays_by_format(char *name, char* hba, struct md_list **devlist, struct mdstat_ent *vol; for (vol = mdstat ; vol ; vol = vol->next) { if (vol->active > 0 && - vol->metadata_version && is_container_member(vol, memb->devnm)) { found++; count++; @@ -7066,7 +7139,7 @@ get_devices(const char *hba_path) path = devt_to_devpath(makedev(major, minor), 1, NULL); if (!path) continue; - if (!path_attached_to_hba(path, hba_path)) { + if 
(!is_path_attached_to_hba(path, hba_path)) { free(path); path = NULL; continue; @@ -8622,7 +8695,7 @@ static void imsm_progress_container_reshape(struct intel_super *super) copy_map_size = sizeof_imsm_map(map); prev_num_members = map->num_members; map->num_members = prev_disks; - dev->vol.migr_state = 1; + dev->vol.migr_state = MIGR_STATE_MIGRATING; set_vol_curr_migr_unit(dev, 0); set_migr_type(dev, MIGR_GEN_MIGR); for (i = prev_num_members; @@ -9854,7 +9927,7 @@ static int apply_reshape_container_disks_update(struct imsm_update_reshape *u, dprintf("imsm: modifying subdev: %i\n", id->index); devices_to_reshape--; - newdev->vol.migr_state = 1; + newdev->vol.migr_state = MIGR_STATE_MIGRATING; set_vol_curr_migr_unit(newdev, 0); set_migr_type(newdev, MIGR_GEN_MIGR); newmap->num_members = u->new_raid_disks; @@ -12646,8 +12719,6 @@ static int imsm_manage_reshape( dprintf("wait_for_reshape_imsm returned error!\n"); goto abort; } - if (sigterm) - goto abort; if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { /* ignore error == 2, this can mean end of reshape here @@ -12656,6 +12727,9 @@ static int imsm_manage_reshape( goto abort; } + if (sigterm) + goto abort; + } /* clear migr_rec on disks after successful migration */ diff --git a/super-mbr.c b/super-mbr.c index 839f000..9881017 100644 --- a/super-mbr.c +++ b/super-mbr.c @@ -41,6 +41,7 @@ #include "mdadm.h" #include "part.h" +#include "xmalloc.h" static void free_mbr(struct supertype *st) { @@ -25,6 +25,8 @@ #define HAVE_STDINT_H 1 #include "mdadm.h" #include "sha1.h" +#include "xmalloc.h" + /* * All handling for the 0.90.0 version superblock is in * this file. 
@@ -83,6 +85,9 @@ static void examine_super0(struct supertype *st, char *homehost) int d; int delta_extra = 0; char *c; + unsigned long expected_csum = 0; + + expected_csum = calc_sb0_csum(sb); printf(" Magic : %08x\n", sb->md_magic); printf(" Version : %d.%02d.%02d\n", @@ -187,11 +192,11 @@ static void examine_super0(struct supertype *st, char *homehost) printf("Working Devices : %d\n", sb->working_disks); printf(" Failed Devices : %d\n", sb->failed_disks); printf(" Spare Devices : %d\n", sb->spare_disks); - if (calc_sb0_csum(sb) == sb->sb_csum) + if (expected_csum == sb->sb_csum) printf(" Checksum : %x - correct\n", sb->sb_csum); else printf(" Checksum : %x - expected %lx\n", - sb->sb_csum, calc_sb0_csum(sb)); + sb->sb_csum, expected_csum); printf(" Events : %llu\n", ((unsigned long long)sb->events_hi << 32) + sb->events_lo); printf("\n"); @@ -229,7 +234,7 @@ static void examine_super0(struct supertype *st, char *homehost) d++) { mdp_disk_t *dp; char *dv; - char nb[11]; + char nb[INT_2_DEC_STR_MAX]; int wonly, failfast; if (d>=0) dp = &sb->disks[d]; else dp = &sb->this_disk; @@ -1212,7 +1217,8 @@ static int locate_bitmap0(struct supertype *st, int fd, int node_num) offset += MD_SB_BYTES; - lseek64(fd, offset, 0); + if (lseek64(fd, offset, 0) < 0) + return -1; return 0; } @@ -24,6 +24,8 @@ #include <stddef.h> #include "mdadm.h" +#include "xmalloc.h" + /* * The version-1 superblock : * All numeric fields are little-endian. 
@@ -260,7 +262,10 @@ static int aread(struct align_fd *afd, void *buf, int len) n = read(afd->fd, b, iosize); if (n <= 0) return n; - lseek(afd->fd, len - n, 1); + if (lseek(afd->fd, len - n, 1) < 0) { + pr_err("lseek fails\n"); + return -1; + } if (n > len) n = len; memcpy(buf, b, n); @@ -294,14 +299,20 @@ static int awrite(struct align_fd *afd, void *buf, int len) n = read(afd->fd, b, iosize); if (n <= 0) return n; - lseek(afd->fd, -n, 1); + if (lseek(afd->fd, -n, 1) < 0) { + pr_err("lseek fails\n"); + return -1; + } } memcpy(b, buf, len); n = write(afd->fd, b, iosize); if (n <= 0) return n; - lseek(afd->fd, len - n, 1); + if (lseek(afd->fd, len - n, 1) < 0) { + pr_err("lseek fails\n"); + return -1; + } return len; } @@ -331,6 +342,9 @@ static void examine_super1(struct supertype *st, char *homehost) unsigned long long sb_offset; struct mdinfo info; int inconsistent = 0; + unsigned int expected_csum = 0; + + expected_csum = calc_sb_1_csum(sb); printf(" Magic : %08x\n", __le32_to_cpu(sb->magic)); printf(" Version : 1"); @@ -498,13 +512,13 @@ static void examine_super1(struct supertype *st, char *homehost) printf("\n"); } - if (calc_sb_1_csum(sb) == sb->sb_csum) + if (expected_csum == sb->sb_csum) printf(" Checksum : %x - correct\n", __le32_to_cpu(sb->sb_csum)); else printf(" Checksum : %x - expected %x\n", __le32_to_cpu(sb->sb_csum), - __le32_to_cpu(calc_sb_1_csum(sb))); + __le32_to_cpu(expected_csum)); printf(" Events : %llu\n", (unsigned long long)__le64_to_cpu(sb->events)); printf("\n"); @@ -911,10 +925,12 @@ static int examine_badblocks_super1(struct supertype *st, int fd, char *devname) offset <<= 9; if (lseek64(fd, offset, 0) < 0) { pr_err("Cannot seek to bad-blocks list\n"); + free(bbl); return 1; } if (read(fd, bbl, size) != size) { pr_err("Cannot read bad-blocks list\n"); + free(bbl); return 1; } /* 64bits per entry. 
10 bits is block-count, 54 bits is block @@ -935,6 +951,7 @@ static int examine_badblocks_super1(struct supertype *st, int fd, char *devname) printf("%20llu for %d sectors\n", sector, count); } + free(bbl); return 0; } @@ -1457,8 +1474,6 @@ static int update_super1(struct supertype *st, struct mdinfo *info, __le32_to_cpu(sb->chunksize)); if (space > optimal_space) space = optimal_space; - if (space > UINT16_MAX) - space = UINT16_MAX; } sb->ppl.offset = __cpu_to_le16(offset); @@ -2667,7 +2682,10 @@ static int locate_bitmap1(struct supertype *st, int fd, int node_num) } if (mustfree) free(sb); - lseek64(fd, offset<<9, 0); + if (lseek64(fd, offset<<9, 0) < 0) { + pr_err("lseek fails\n"); + ret = -1; + } return ret; } @@ -24,9 +24,11 @@ */ #include "mdadm.h" +#include "dlink.h" +#include "xmalloc.h" + #include <dirent.h> #include <ctype.h> -#include "dlink.h" #define MAX_SYSFS_PATH_LEN 120 @@ -73,6 +75,47 @@ void sysfs_free(struct mdinfo *sra) sra = sra2; } } +/** + * write_attr() - write value to fd, don't check errno. + * @attr: value to write. + * @fd: file descriptor write to. + * + * Size to write is calculated by strlen(). + */ +mdadm_status_t write_attr(const char *value, const int fd) +{ + return sysfs_write_descriptor(fd, value, strlen(value), NULL); +} + +/** + * sysfs_write_descriptor()- wrapper for write(), projected to be used with sysfs. + * @fd: file descriptor. + * @value: value to set. + * @len: length of the value. + * @errno_p: On write() failure, buffer to copy errno value, might be NULL. + * + * Errors are differentiated, because (at least theoretically) kernel may not process whole string + * and it may or may not be a problem (it depends on implementation in kernel). Decision belongs to + * caller then. + * Generally, it should be safe to check if @errno_p changed to determine if error occurred. 
+ */ +mdadm_status_t sysfs_write_descriptor(const int fd, const char *value, const ssize_t len, + int *errno_p) +{ + ssize_t ret; + + ret = write(fd, value, len); + if (ret == -1) { + if (errno_p) + *errno_p = errno; + return MDADM_STATUS_ERROR; + } + + if (ret != len) + return MDADM_STATUS_UNDEF; + + return MDADM_STATUS_SUCCESS; +} /** * sysfs_get_container_devnm() - extract container device name. @@ -97,6 +140,24 @@ void sysfs_get_container_devnm(struct mdinfo *mdi, char *buf) *p = 0; } +/** + * sysfs_open_memb_attr() - helper to get sysfs attr descriptor for member device. + * @array_devnm: array kernel device name. + * @memb_devnm: member device kernel device name. + * @attr: requested sysfs attribute. + * @oflag: open() flags. + * + * To refer member device directory, we need to append "dev-" before the member device name. + */ +int sysfs_open_memb_attr(char *array_devnm, char *memb_devnm, char *attr, int oflag) +{ + char path[PATH_MAX]; + + snprintf(path, PATH_MAX, "/sys/block/%s/md/dev-%s/%s", array_devnm, memb_devnm, attr); + + return open(path, oflag); +} + int sysfs_open(char *devnm, char *devname, char *attr) { char fname[MAX_SYSFS_PATH_LEN]; @@ -139,13 +200,14 @@ int sysfs_init(struct mdinfo *mdi, int fd, char *devnm) goto out; if (!S_ISDIR(stb.st_mode)) goto out; - strcpy(mdi->sys_name, devnm); + strncpy(mdi->sys_name, devnm, sizeof(mdi->sys_name) - 1); retval = 0; out: return retval; } +/* If fd >= 0, get the array it is open on, else use devnm. 
*/ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) { char fname[PATH_MAX]; @@ -179,6 +241,7 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) sra->array.major_version = -1; sra->array.minor_version = -2; strcpy(sra->text_version, buf+9); + sra->text_version[sizeof(sra->text_version) - 1] = '\0'; } else { sscanf(buf, "%d.%d", &sra->array.major_version, @@ -340,6 +403,7 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) } strcpy(dev->sys_name, de->d_name); + dev->sys_name[sizeof(dev->sys_name) - 1] = '\0'; dev->disk.raid_disk = strtoul(buf, &ep, 10); if (*ep) dev->disk.raid_disk = -1; @@ -484,7 +548,6 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val) { char fname[MAX_SYSFS_PATH_LEN]; - unsigned int n; int fd; snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s", @@ -492,13 +555,14 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, fd = open(fname, O_WRONLY); if (fd < 0) return -1; - n = write(fd, val, strlen(val)); - close(fd); - if (n != strlen(val)) { - dprintf("failed to write '%s' to '%s' (%s)\n", - val, fname, strerror(errno)); + + if (write_attr(val, fd)) { + pr_err("failed to write '%s' to '%s' (%s)\n", val, fname, strerror(errno)); + close(fd); return -1; } + + close(fd); return 0; } @@ -521,7 +585,6 @@ int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev, int sysfs_uevent(struct mdinfo *sra, char *event) { char fname[MAX_SYSFS_PATH_LEN]; - int n; int fd; snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/uevent", @@ -529,13 +592,14 @@ int sysfs_uevent(struct mdinfo *sra, char *event) fd = open(fname, O_WRONLY); if (fd < 0) return -1; - n = write(fd, event, strlen(event)); - close(fd); - if (n != (int)strlen(event)) { - dprintf("failed to write '%s' to '%s' (%s)\n", - event, fname, strerror(errno)); + + if (write_attr(event, fd)) { + pr_err("failed to write '%s' to '%s' (%s)\n", event, fname, strerror(errno)); + close(fd); return 
-1; } + + close(fd); return 0; } @@ -772,8 +836,8 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) memset(nm, 0, sizeof(nm)); dname = devid2kname(makedev(sd->disk.major, sd->disk.minor)); - strcpy(sd->sys_name, "dev-"); - strcpy(sd->sys_name+4, dname); + + snprintf(sd->sys_name, sizeof(sd->sys_name), "dev-%s", dname); /* test write to see if 'recovery_start' is available */ if (resume && sd->recovery_start < MaxSector && diff --git a/systemd/SUSE-mdadm_env.sh b/systemd/SUSE-mdadm_env.sh deleted file mode 100644 index c13b48a..0000000 --- a/systemd/SUSE-mdadm_env.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/sh - -# extract configuration from /etc/sysconfig/mdadm and write -# environment to /run/sysconfig/mdadm to be used by -# systemd unit files. - -MDADM_SCAN="yes" - -# Following adapted from /etc/init.d/mdadmd on openSUSE - -mdadmd_CONFIG=/etc/sysconfig/mdadm -if test -r $mdadmd_CONFIG; then - . $mdadmd_CONFIG -fi - -if [ x$MDADM_DELAY != x"" ]; then - MDADM_DELAY="-d "$MDADM_DELAY; -fi - -if [ x$MDADM_MAIL != x"" ]; then - MDADM_MAIL="-m \"$MDADM_MAIL\"" -fi - -if [ x$MDADM_PROGRAM != x"" ]; then - MDADM_PROGRAM="-p \"$MDADM_PROGRAM\"" -fi - -if [ x$MDADM_SCAN = x"yes" ]; then - MDADM_SCAN="--scan" -else - MDADM_SCAN="" -fi - -if [ x$MDADM_SEND_MAIL_ON_START = x"yes" ]; then - MDADM_SEND_MAIL="-t" -else - MDADM_SEND_MAIL="" -fi - -if [ x$MDADM_CONFIG != x"" ]; then - MDADM_CONFIG="-c \"$MDADM_CONFIG\"" -fi - -mkdir -p /run/sysconfig -echo "MDADM_MONITOR_ARGS=$MDADM_RAIDDEVICES $MDADM_DELAY $MDADM_MAIL $MDADM_PROGRAM $MDADM_SCAN $MDADM_SEND_MAIL $MDADM_CONFIG" > /run/sysconfig/mdadm -if [ -n "$MDADM_CHECK_DURATION" ]; then - echo "MDADM_CHECK_DURATION=$MDADM_CHECK_DURATION" >> /run/sysconfig/mdadm -fi diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service index 9c36478..95046bc 100644 --- a/systemd/mdmonitor.service +++ b/systemd/mdmonitor.service @@ -11,7 +11,21 @@ DefaultDependencies=no Documentation=man:mdadm(8) [Service] 
-Environment= MDADM_MONITOR_ARGS=--scan -EnvironmentFile=-/run/sysconfig/mdadm -ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh -ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS +# For Maintainers: +# We need to ensure that the mdmonitor configuration aligns with the guidelines provided +# in the man page for users. +# /etc/sysconfig/mdadm, /etc/sysconfig/mdmonitor, or any other similar configuration file should +# not be supported because non upstream components are not described in man pages. + +# Parameters designed to be customized by user, should be settable via mdadm.conf: +# - MONITORDELAY (do not set --delay in service) +# - MAILADDR (do not set --mail in service) +# - MAILFROM (not settable from cmdline) +# - PROGRAM (do not set --program or --alert in service) +# +# Following parameters can be customized in service: +# - --syslog (configure syslog logging) +# - --fork (Type=forking must be added, not recommended and not needed with systemd) +# - --pid-file (allowed only when --fork selected) + +ExecStart=BINDIR/mdadm --monitor --scan @@ -78,7 +78,7 @@ mdadm() { $mdadm --zero $args > /dev/null } done - $mdadm 2> $targetdir/stderr "$@" --auto=yes + $mdadm 2> $targetdir/stderr "$@" ;; * ) $mdadm 2> $targetdir/stderr "$@" diff --git a/tests/05r6tor0 b/tests/05r6tor0 index 2fd51f2..b2685b7 100644 --- a/tests/05r6tor0 +++ b/tests/05r6tor0 @@ -13,6 +13,10 @@ check raid5 testdev $md0 3 19456 512 mdadm -G $md0 -l0 check wait; sleep 1 +while ps auxf | grep "mdadm -G" | grep -v grep +do + sleep 1 +done check raid0 testdev $md0 3 19456 512 mdadm -G $md0 -l5 --add $dev3 $dev4 diff --git a/tests/07changelevels b/tests/07changelevels index a328874..3df8660 100644 --- a/tests/07changelevels +++ b/tests/07changelevels @@ -10,7 +10,6 @@ export MDADM_GROW_VERIFY=1 dotest() { sleep 2 check wait - testdev $md0 $1 19968 64 nd blockdev --flushbufs $md0 cmp -s -n $[textK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } # write something new - shift chars 4 space @@ -24,7 
+23,7 @@ checkgeo() { # level raid_disks chunk_size layout dev=$1 shift - sleep 0.5 + sleep 15 check wait sleep 1 for attr in level raid_disks chunk_size layout @@ -43,22 +42,25 @@ checkgeo() { bu=/tmp/md-test-backup rm -f $bu -mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 -z 19968 -testdev $md0 1 $mdsize1a 64 +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 +[ -b $md0 ] || die "$1 isn't a block device." dd if=/tmp/RandFile of=$md0 dotest 1 -mdadm --grow $md0 -l5 -n3 --chunk 64 +mdadm --grow $md0 -l5 -n3 +checkgeo md0 raid5 3 dotest 2 mdadm $md0 --add $dev3 $dev4 mdadm --grow $md0 -n4 --chunk 32 +checkgeo md0 raid5 4 $[32*1024] dotest 3 mdadm -G $md0 -l6 --backup-file $bu +checkgeo md0 raid6 5 $[32*1024] dotest 3 -mdadm -G /dev/md0 --array-size 39936 +mdadm -G /dev/md0 --array-size 37888 mdadm -G $md0 -n4 --backup-file $bu checkgeo md0 raid6 4 $[32*1024] dotest 2 @@ -67,14 +69,11 @@ mdadm -G $md0 -l5 --backup-file $bu checkgeo md0 raid5 3 $[32*1024] dotest 2 -mdadm -G /dev/md0 --array-size 19968 +mdadm -G /dev/md0 --array-size 18944 mdadm -G $md0 -n2 --backup-file $bu checkgeo md0 raid5 2 $[32*1024] dotest 1 -mdadm -G --level=1 $md0 -dotest 1 - # now repeat that last few steps only with a degraded array. 
mdadm -S $md0 mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 @@ -83,7 +82,7 @@ dotest 3 mdadm $md0 --fail $dev0 -mdadm -G /dev/md0 --array-size 37888 +mdadm -G /dev/md0 --array-size 35840 mdadm -G $md0 -n4 --backup-file $bu dotest 2 checkgeo md0 raid6 4 $[512*1024] @@ -103,12 +102,10 @@ dotest 2 mdadm -G $md0 -l5 --backup-file $bu dotest 2 -mdadm -G /dev/md0 --array-size 18944 +mdadm -G /dev/md0 --array-size 17920 mdadm -G $md0 -n2 --backup-file $bu dotest 1 checkgeo md0 raid5 2 $[512*1024] mdadm $md0 --fail $dev2 -mdadm -G --level=1 $md0 -dotest 1 -checkgeo md0 raid1 2 +mdadm -S $md0 diff --git a/tests/07changelevels.broken b/tests/07changelevels.broken deleted file mode 100644 index 9b930d9..0000000 --- a/tests/07changelevels.broken +++ /dev/null @@ -1,9 +0,0 @@ -always fails - -Fails with errors: - - mdadm: /dev/loop0 is smaller than given size. 18976K < 19968K + metadata - mdadm: /dev/loop1 is smaller than given size. 18976K < 19968K + metadata - mdadm: /dev/loop2 is smaller than given size. 18976K < 19968K + metadata - - ERROR: /dev/md0 isn't a block device. diff --git a/tests/07reshape5intr.broken b/tests/07reshape5intr.broken deleted file mode 100644 index efe52a6..0000000 --- a/tests/07reshape5intr.broken +++ /dev/null @@ -1,45 +0,0 @@ -always fails - -This patch, recently added to md-next causes the test to always fail: - -7e6ba434cc60 ("md: don't unregister sync_thread with reconfig_mutex -held") - -The new error is simply: - - ERROR: no reshape happening - -Before the patch, the error seen is below. - --- - -fails infrequently - -Fails roughly 1 in 4 runs with errors: - - mdadm: Merging with already-assembled /dev/md/0 - mdadm: cannot re-read metadata from /dev/loop6 - aborting - - ERROR: no reshape happening - -Also have seen a random deadlock: - - INFO: task mdadm:109702 blocked for more than 30 seconds. 
- Not tainted 5.18.0-rc3-eid-vmlocalyes-dbg-00095-g3c2b5427979d #2040 - "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. - task:mdadm state:D stack: 0 pid:109702 ppid: 1 flags:0x00004000 - Call Trace: - <TASK> - __schedule+0x67e/0x13b0 - schedule+0x82/0x110 - mddev_suspend+0x2e1/0x330 - suspend_lo_store+0xbd/0x140 - md_attr_store+0xcb/0x130 - sysfs_kf_write+0x89/0xb0 - kernfs_fop_write_iter+0x202/0x2c0 - new_sync_write+0x222/0x330 - vfs_write+0x3bc/0x4d0 - ksys_write+0xd9/0x180 - __x64_sys_write+0x43/0x50 - do_syscall_64+0x3b/0x90 - entry_SYSCALL_64_after_hwframe+0x44/0xae diff --git a/tests/07testreshape5 b/tests/07testreshape5 index 0e1f25f..d90fd15 100644 --- a/tests/07testreshape5 +++ b/tests/07testreshape5 @@ -4,6 +4,7 @@ # kernel md code to move data into and out of variously # shaped md arrays. set -x +dir="." layouts=(la ra ls rs) for level in 5 6 do diff --git a/tests/07testreshape5.broken b/tests/07testreshape5.broken deleted file mode 100644 index a8ce03e..0000000 --- a/tests/07testreshape5.broken +++ /dev/null @@ -1,12 +0,0 @@ -always fails - -Test seems to run 'test_stripe' at $dir directory, but $dir is never -set. If $dir is adjusted to $PWD, the test still fails with: - - mdadm: /dev/loop2 is not suitable for this array. - mdadm: create aborted - ++ return 1 - ++ cmp -s -n 8192 /dev/md0 /tmp/RandFile - ++ echo cmp failed - cmp failed - ++ exit 2 diff --git a/tests/09imsm-assemble.broken b/tests/09imsm-assemble.broken deleted file mode 100644 index a6d4d5c..0000000 --- a/tests/09imsm-assemble.broken +++ /dev/null @@ -1,6 +0,0 @@ -fails infrequently - -Fails roughly 1 in 10 runs with errors: - - mdadm: /dev/loop2 is still in use, cannot remove. 
- /dev/loop2 removal from /dev/md/container should have succeeded diff --git a/tests/18imsm-1d-takeover-r0_1d b/tests/18imsm-1d-takeover-r0_1d index 6f5cf5a..203e240 100644 --- a/tests/18imsm-1d-takeover-r0_1d +++ b/tests/18imsm-1d-takeover-r0_1d @@ -7,12 +7,12 @@ vol0_num_comps=1 vol0_comp_size=$((10 * 1024)) # Create container -mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 +mdadm --create --run $container --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 check wait imsm_check container $vol0_num_comps # Create RAID 0 volume -mdadm --create --run $member0 --auto=md --level=0 --size=$vol0_comp_size --chunk=64 --force --raid-disks=$vol0_num_comps $dev0 +mdadm --create --run $member0 --level=0 --size=$vol0_comp_size --chunk=64 --force --raid-disks=$vol0_num_comps $dev0 check wait # Test the member diff --git a/tests/func.sh b/tests/func.sh index e7ccc4f..567d91d 100644 --- a/tests/func.sh +++ b/tests/func.sh @@ -362,6 +362,10 @@ check() { do sleep 0.5 done + while ps auxf | grep "mdadm --grow --continue" | grep -v grep + do + sleep 1 + done echo $min > /proc/sys/dev/raid/speed_limit_min echo $max > /proc/sys/dev/raid/speed_limit_max ;; diff --git a/tests/imsm-grow-template b/tests/imsm-grow-template index 1a8676e..f69e025 100644 --- a/tests/imsm-grow-template +++ b/tests/imsm-grow-template @@ -37,24 +37,24 @@ function grow_member() { } # Create container -mdadm --create --run $container --auto=md --metadata=imsm --raid-disks=$num_disks $device_list +mdadm --create --run $container --metadata=imsm --raid-disks=$num_disks $device_list check wait imsm_check container $num_disks # Create first volume inside the container if [[ ! 
-z $vol0_chunk ]]; then - mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --chunk=$vol0_chunk --raid-disks=$num_disks $device_list + mdadm --create --run $member0 --level=$vol0_level --size=$vol0_comp_size --chunk=$vol0_chunk --raid-disks=$num_disks $device_list else - mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --raid-disks=$num_disks $device_list + mdadm --create --run $member0 --level=$vol0_level --size=$vol0_comp_size --raid-disks=$num_disks $device_list fi check wait # Create second volume inside the container (if defined) if [ ! -z $vol1_level ]; then if [ ! -z $vol1_chunk ]; then - mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --chunk=$vol1_chunk --raid-disks=$num_disks $device_list + mdadm --create --run $member1 --level=$vol1_level --size=$vol1_comp_size --chunk=$vol1_chunk --raid-disks=$num_disks $device_list else - mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --raid-disks=$num_disks $device_list + mdadm --create --run $member1 --level=$vol1_level --size=$vol1_comp_size --raid-disks=$num_disks $device_list fi check wait fi diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules index d4a7f0a..4cd2c6f 100644 --- a/udev-md-raid-assembly.rules +++ b/udev-md-raid-assembly.rules @@ -41,7 +41,7 @@ ACTION=="change", KERNEL!="dm-*|md*", GOTO="md_inc_end" ACTION!="remove", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot $env{DEVLINKS}" ACTION!="remove", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer" -ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}" -ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name" +ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $devnode --path $env{ID_PATH}" +ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If 
$devnode" LABEL="md_inc_end" @@ -22,6 +22,8 @@ #include "udev.h" #include "md_p.h" #include "md_u.h" +#include "xmalloc.h" + #include <sys/wait.h> #include <signal.h> #include <limits.h> @@ -24,6 +24,8 @@ #include "mdadm.h" #include "md_p.h" +#include "xmalloc.h" + #include <sys/socket.h> #include <sys/utsname.h> #include <sys/wait.h> @@ -513,6 +515,9 @@ int enough(int level, int raid_disks, int layout, int clean, char *avail) int i; int avail_disks = 0; + if (raid_disks <= 0) + return 0; + for (i = 0; i < raid_disks; i++) avail_disks += !!avail[i]; @@ -521,7 +526,7 @@ int enough(int level, int raid_disks, int layout, int clean, char *avail) /* This is the tricky one - we need to check * which actual disks are present. */ - copies = (layout&255)* ((layout>>8) & 255); + copies = (layout & 255) * ((layout >> 8) & 255); first = 0; do { /* there must be one of the 'copies' form 'first' */ @@ -531,16 +536,16 @@ int enough(int level, int raid_disks, int layout, int clean, char *avail) while (n--) { if (avail[this]) cnt++; - this = (this+1) % raid_disks; + this = (this + 1) % raid_disks; } if (cnt == 0) return 0; - first = (first+(layout&255)) % raid_disks; + first = (first + (layout & 255)) % raid_disks; } while (first != 0); return 1; case LEVEL_MULTIPATH: - return avail_disks>= 1; + return avail_disks >= 1; case LEVEL_LINEAR: case 0: return avail_disks == raid_disks; @@ -556,12 +561,12 @@ int enough(int level, int raid_disks, int layout, int clean, char *avail) /* FALL THROUGH */ case 5: if (clean) - return avail_disks >= raid_disks-1; + return avail_disks >= raid_disks - 1; else return avail_disks >= raid_disks; case 6: if (clean) - return avail_disks >= raid_disks-2; + return avail_disks >= raid_disks - 2; else return avail_disks >= raid_disks; default: @@ -755,42 +760,6 @@ bad_option: return 0; } -int is_standard(char *dev, int *nump) -{ - /* tests if dev is a "standard" md dev name. 
- * i.e if the last component is "/dNN" or "/mdNN", - * where NN is a string of digits - * Returns 1 if a partitionable standard, - * -1 if non-partitonable, - * 0 if not a standard name. - */ - char *d = strrchr(dev, '/'); - int type = 0; - int num; - if (!d) - return 0; - if (strncmp(d, "/d",2) == 0) - d += 2, type = 1; /* /dev/md/dN{pM} */ - else if (strncmp(d, "/md_d", 5) == 0) - d += 5, type = 1; /* /dev/md_dN{pM} */ - else if (strncmp(d, "/md", 3) == 0) - d += 3, type = -1; /* /dev/mdN */ - else if (d-dev > 3 && strncmp(d-2, "md/", 3) == 0) - d += 1, type = -1; /* /dev/md/N */ - else - return 0; - if (!*d) - return 0; - num = atoi(d); - while (isdigit(*d)) - d++; - if (*d) - return 0; - if (nump) *nump = num; - - return type; -} - unsigned long calc_csum(void *super, int bytes) { unsigned long long newcsum = 0; @@ -1003,7 +972,7 @@ static bool is_devname_numbered(const char *devname, const char *pref, const int if (parse_num(&val, devname + pref_len) != 0) return false; - if (val > 127) + if (val > 1024) return false; return true; @@ -1106,17 +1075,6 @@ int dev_open(char *dev, int flags) fd = open(devname, flags); unlink(devname); } - if (fd < 0) { - /* Try /tmp as /dev appear to be read-only */ - snprintf(devname, sizeof(devname), - "/tmp/.tmp.md.%d:%d:%d", - (int)getpid(), major, minor); - if (mknod(devname, S_IFBLK|0600, - makedev(major, minor)) == 0) { - fd = open(devname, flags); - unlink(devname); - } - } } else fd = open(dev, flags); return fd; @@ -1253,7 +1211,7 @@ struct supertype *super_by_fd(int fd, char **subarrayp) *subarray++ = '\0'; subarray = xstrdup(subarray); } - strcpy(container, dev); + snprintf(container, sizeof(container), "%s", dev); sysfs_free(sra); sra = sysfs_read(-1, container, GET_VERSION); if (sra && sra->text_version[0]) @@ -1430,7 +1388,8 @@ static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) /* skip protective MBR */ if (!get_dev_sector_size(fd, NULL, §or_size)) return 0; - lseek(fd, sector_size, 
SEEK_SET); + if (lseek(fd, sector_size, SEEK_SET) == -1L) + return 0; /* read GPT header */ if (read(fd, &gpt, 512) != 512) return 0; @@ -1451,7 +1410,8 @@ static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) part = (struct GPT_part_entry *)buf; /* set offset to third block (GPT entries) */ - lseek(fd, sector_size*2, SEEK_SET); + if (lseek(fd, sector_size*2, SEEK_SET) == -1L) + return 0; for (part_nr = 0; part_nr < all_partitions; part_nr++) { /* read partition entry */ if (read(fd, buf, entry_size) != (ssize_t)entry_size) @@ -1486,7 +1446,8 @@ static int get_last_partition_end(int fd, unsigned long long *endofpart) BUILD_BUG_ON(sizeof(boot_sect) != 512); /* read MBR */ - lseek(fd, 0, 0); + if (lseek(fd, 0, 0) == -1L) + goto abort; if (read(fd, &boot_sect, 512) != 512) goto abort; @@ -1671,16 +1632,6 @@ int metadata_subdev_matches(char *metadata, char *devnm) return 0; } -int is_container_member(struct mdstat_ent *mdstat, char *container) -{ - if (mdstat->metadata_version == NULL || - strncmp(mdstat->metadata_version, "external:", 9) != 0 || - !metadata_container_matches(mdstat->metadata_version+9, container)) - return 0; - - return 1; -} - int is_subarray_active(char *subarray, char *container) { struct mdstat_ent *mdstat = mdstat_read(0, 0); @@ -1725,7 +1676,7 @@ int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet) dev); goto close_fd; } - strcpy(st->devnm, _devnm); + snprintf(st->devnm, sizeof(st->devnm), "%s", _devnm); mdi = sysfs_read(fd, st->devnm, GET_VERSION|GET_LEVEL); if (!mdi) { @@ -1857,14 +1808,23 @@ int hot_remove_disk(int mdfd, unsigned long dev, int force) int sys_hot_remove_disk(int statefd, int force) { + static const char val[] = "remove"; int cnt = force ? 
500 : 5; - int ret; - while ((ret = write(statefd, "remove", 6)) == -1 && - errno == EBUSY && - cnt-- > 0) + while (cnt--) { + int err = 0; + int ret = sysfs_write_descriptor(statefd, val, strlen(val), &err); + + if (ret == MDADM_STATUS_SUCCESS) + return 0; + + if (err != EBUSY) + break; + sleep_for(0, MSEC_TO_NSEC(10), true); - return ret == 6 ? 0 : -1; + } + + return -1; } int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) @@ -2303,14 +2263,16 @@ void manage_fork_fds(int close_all) { DIR *dir; struct dirent *dirent; + int fd = open("/dev/null", O_RDWR); - close(0); - open("/dev/null", O_RDWR); - + if (is_fd_valid(fd)) { + dup2(fd, 0); #ifndef DEBUG dup2(0, 1); dup2(0, 2); + close_fd(&fd); #endif + } if (close_all == 0) return; @@ -2329,8 +2291,10 @@ void manage_fork_fds(int close_all) fd = strtol(dirent->d_name, NULL, 10); if (fd > 2) - close(fd); + close_fd(&fd); } + closedir(dir); + return; } /* In a systemd/udev world, it is best to get systemd to @@ -2377,13 +2341,15 @@ void reopen_mddev(int mdfd) /* Re-open without any O_EXCL, but keep * the same fd */ - char *devnm; - int fd; - devnm = fd2devnm(mdfd); - close(mdfd); - fd = open_dev(devnm); - if (fd >= 0 && fd != mdfd) - dup2(fd, mdfd); + char *devnm = fd2devnm(mdfd); + int fd = open_dev(devnm); + + if (!is_fd_valid(fd)) + return; + + dup2(fd, mdfd); + + close_fd(&fd); } static struct cmap_hooks *cmap_hooks = NULL; @@ -21,64 +21,57 @@ * Email: <neilb@suse.de> */ -#include "mdadm.h" -/*#include <sys/socket.h> -#include <sys/utsname.h> -#include <sys/wait.h> -#include <sys/un.h> -#include <ctype.h> -#include <dirent.h> -#include <signal.h> -*/ +#include "xmalloc.h" +#include "mdadm_status.h" + +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +static void *exit_memory_alloc_failure(void) +{ + fprintf(stderr, "Memory allocation failure - aborting\n"); + + exit(MDADM_STATUS_MEM_FAIL); +} void *xmalloc(size_t len) { void *rv = malloc(len); - char *msg; - int 
n; + if (rv) return rv; - msg = ": memory allocation failure - aborting\n"; - n = write(2, Name, strlen(Name)); - n += write(2, msg, strlen(msg)); - exit(4+!!n); + + return exit_memory_alloc_failure(); } void *xrealloc(void *ptr, size_t len) { void *rv = realloc(ptr, len); - char *msg; - int n; + if (rv) return rv; - msg = ": memory allocation failure - aborting\n"; - n = write(2, Name, strlen(Name)); - n += write(2, msg, strlen(msg)); - exit(4+!!n); + + return exit_memory_alloc_failure(); } void *xcalloc(size_t num, size_t size) { void *rv = calloc(num, size); - char *msg; - int n; + if (rv) return rv; - msg = ": memory allocation failure - aborting\n"; - n = write(2, Name, strlen(Name)); - n += write(2, msg, strlen(msg)); - exit(4+!!n); + + return exit_memory_alloc_failure(); } char *xstrdup(const char *str) { char *rv = strdup(str); - char *msg; - int n; + if (rv) return rv; - msg = ": memory allocation failure - aborting\n"; - n = write(2, Name, strlen(Name)); - n += write(2, msg, strlen(msg)); - exit(4+!!n); + + return exit_memory_alloc_failure(); } diff --git a/xmalloc.h b/xmalloc.h new file mode 100644 index 0000000..0904b0a --- /dev/null +++ b/xmalloc.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef XMALLOC_H +#define XMALLOC_H + +#include <stddef.h> + +void *xmalloc(size_t len); +void *xrealloc(void *ptr, size_t len); +void *xcalloc(size_t num, size_t size); +char *xstrdup(const char *str); + +#endif |